/*
 * op_x86_res_calc: auto-generated CPU (x86) variant of an OP2 res_calc
 * kernel.  Presumably mirrors the CUDA-generated version: the four
 * indirectly-accessed datasets are staged into a block-local "shared"
 * buffer, then the block's set elements are processed; ind_arg3 is an
 * increment argument (zero-initialised, accumulated per color).
 * NOTE(review): generated code — confirm against the OP2 code generator.
 *
 * blockIdx / block_offset: index into blkmap to find this block's id.
 * ind_argN / ind_argN_maps: global data and local->global index maps.
 * argN_maps: per-element indices into the staged shared buffers.
 * ind_arg_sizes / ind_arg_offs: per-block sizes/offsets, 4 entries per block.
 * offset/nelems/ncolors/colors: per-block element range and coloring info.
 */
void op_x86_res_calc(
int blockIdx,
float *ind_arg0, int *ind_arg0_maps,
float *ind_arg1, int *ind_arg1_maps,
float *ind_arg2, int *ind_arg2_maps,
float *ind_arg3, int *ind_arg3_maps,
short *arg0_maps,
short *arg1_maps,
short *arg2_maps,
short *arg3_maps,
short *arg4_maps,
short *arg5_maps,
short *arg6_maps,
short *arg7_maps,
int *ind_arg_sizes,
int *ind_arg_offs,
int block_offset,
int *blkmap,
int *offset,
int *nelems,
int *ncolors,
int *colors) {
/* per-element local accumulators for the two incremented arguments (dim 4) */
float arg6_l[4];
float arg7_l[4];
int *ind_arg0_map, ind_arg0_size;
int *ind_arg1_map, ind_arg1_size;
int *ind_arg2_map, ind_arg2_size;
int *ind_arg3_map, ind_arg3_size;
/* pointers into the staged ("shared") copies of the indirect datasets */
float *ind_arg0_s;
float *ind_arg1_s;
float *ind_arg2_s;
float *ind_arg3_s;
int nelems2, ncolor;
int nelem, offset_b;
/* emulates GPU shared memory on the CPU: one 64000-byte stack buffer */
char shared[64000];
/* generated thread-0 guard; collapses to always-true in the x86 build */
if (0==0) {
// get sizes and shift pointers and direct-mapped data
int blockId = blkmap[blockIdx + block_offset];
nelem = nelems[blockId];
offset_b = offset[blockId];
nelems2 = nelem;
ncolor = ncolors[blockId];
/* four indirect args per block, so per-block entries are blockId*4 apart */
ind_arg0_size = ind_arg_sizes[0+blockId*4];
ind_arg1_size = ind_arg_sizes[1+blockId*4];
ind_arg2_size = ind_arg_sizes[2+blockId*4];
ind_arg3_size = ind_arg_sizes[3+blockId*4];
ind_arg0_map = ind_arg0_maps + ind_arg_offs[0+blockId*4];
ind_arg1_map = ind_arg1_maps + ind_arg_offs[1+blockId*4];
ind_arg2_map = ind_arg2_maps + ind_arg_offs[2+blockId*4];
ind_arg3_map = ind_arg3_maps + ind_arg_offs[3+blockId*4];
// set shared memory pointers
/* carve the shared buffer: datasets have dims 2, 4, 1, 4 respectively;
ROUND_UP presumably keeps each sub-buffer suitably aligned — confirm macro */
int nbytes = 0;
ind_arg0_s = (float *) &shared[nbytes];
nbytes += ROUND_UP(ind_arg0_size*sizeof(float)*2);
ind_arg1_s = (float *) &shared[nbytes];
nbytes += ROUND_UP(ind_arg1_size*sizeof(float)*4);
ind_arg2_s = (float *) &shared[nbytes];
nbytes += ROUND_UP(ind_arg2_size*sizeof(float)*1);
ind_arg3_s = (float *) &shared[nbytes];
}
__syncthreads(); // make sure all of above completed (presumably a no-op/barrier macro in the x86 build)
// copy indirect datasets into shared memory or zero increment
for (int n=0; n<ind_arg0_size; n++)
for (int d=0; d<2; d++)
ind_arg0_s[d+n*2] = ind_arg0[d+ind_arg0_map[n]*2];
for (int n=0; n<ind_arg1_size; n++)
for (int d=0; d<4; d++)
ind_arg1_s[d+n*4] = ind_arg1[d+ind_arg1_map[n]*4];
for (int n=0; n<ind_arg2_size; n++)
for (int d=0; d<1; d++)
ind_arg2_s[d+n*1] = ind_arg2[d+ind_arg2_map[n]*1];
/* ind_arg3 is an increment (OP_INC) dataset: start from zero, not a copy */
for (int n=0; n<ind_arg3_size; n++)
for (int d=0; d<4; d++)
ind_arg3_s[d+n*4] = ZERO_float;
__syncthreads();
// process set elements
for (int n=0; n<nelems2; n++) {
int col2 = -1;
if (n<nelem) {
//......... part of the code omitted here (truncated in this excerpt) .........
/*
 * alloc_region: boot-time allocator.  Carves a BIT(size_bits)-byte,
 * 2^size_bits-aligned region out of the ndks_boot.freemem array,
 * picking the candidate that minimises fragmentation: smallest
 * small-remainder first, with the smallest large-remainder as the
 * tie-breaker (ideally one remainder is empty).
 *
 * Returns the start address of the allocated region, or 0 on failure.
 * NOTE(review): assumes 0 is never a valid region start — confirm
 * against callers.
 */
BOOT_CODE pptr_t
alloc_region(uint32_t size_bits)
{
unsigned int i;
unsigned int reg_index = 0; /* gcc cannot work out that this will not be used uninitialized */
region_t reg = REG_EMPTY;       /* best candidate found so far */
region_t rem_small = REG_EMPTY; /* smaller leftover of the best candidate */
region_t rem_large = REG_EMPTY; /* larger leftover of the best candidate */
region_t new_reg;
region_t new_rem_small;
region_t new_rem_large;
/* Search for a freemem region that will be the best fit for an allocation. We favour allocations
 * that are aligned to either end of the region. If an allocation must split a region we favour
 * an unbalanced split. In both cases we attempt to use the smallest region possible. In general
 * this means we aim to make the size of the smallest remaining region smaller (ideally zero)
 * followed by making the size of the largest remaining region smaller */
for (i = 0; i < MAX_NUM_FREEMEM_REG; i++) {
/* Determine whether placing the region at the start or the end will create a bigger left over region */
if (ROUND_UP(ndks_boot.freemem[i].start, size_bits) - ndks_boot.freemem[i].start <
ndks_boot.freemem[i].end - ROUND_DOWN(ndks_boot.freemem[i].end, size_bits)) {
/* aligning up from the start wastes less: place at the start */
new_reg.start = ROUND_UP(ndks_boot.freemem[i].start, size_bits);
new_reg.end = new_reg.start + BIT(size_bits);
} else {
/* otherwise place at the end of the freemem region */
new_reg.end = ROUND_DOWN(ndks_boot.freemem[i].end, size_bits);
new_reg.start = new_reg.end - BIT(size_bits);
}
/* candidate is valid only if non-empty and entirely inside freemem[i]
 * (the arithmetic above can under/overflow out of the region) */
if (new_reg.end > new_reg.start &&
new_reg.start >= ndks_boot.freemem[i].start &&
new_reg.end <= ndks_boot.freemem[i].end) {
/* classify the two leftovers as the smaller and the larger remainder */
if (new_reg.start - ndks_boot.freemem[i].start < ndks_boot.freemem[i].end - new_reg.end) {
new_rem_small.start = ndks_boot.freemem[i].start;
new_rem_small.end = new_reg.start;
new_rem_large.start = new_reg.end;
new_rem_large.end = ndks_boot.freemem[i].end;
} else {
new_rem_large.start = ndks_boot.freemem[i].start;
new_rem_large.end = new_reg.start;
new_rem_small.start = new_reg.end;
new_rem_small.end = ndks_boot.freemem[i].end;
}
/* adopt the candidate if it is the first fit found, or it fragments
 * less than the current best (smaller small-remainder; equal small
 * remainders tie-broken on the smaller large-remainder) */
if ( is_reg_empty(reg) ||
(reg_size(new_rem_small) < reg_size(rem_small)) ||
(reg_size(new_rem_small) == reg_size(rem_small) && reg_size(new_rem_large) < reg_size(rem_large)) ) {
reg = new_reg;
rem_small = new_rem_small;
rem_large = new_rem_large;
reg_index = i;
}
}
}
if (is_reg_empty(reg)) {
printf("Kernel init failing: not enough memory\n");
return 0;
}
/* Remove the region in question */
ndks_boot.freemem[reg_index] = REG_EMPTY;
/* Add the remaining regions in largest to smallest order */
/* NOTE(review): insert_region(rem_large)'s return value is ignored here
 * while rem_small's is checked — confirm this asymmetry is intentional */
insert_region(rem_large);
if (!insert_region(rem_small)) {
printf("alloc_region(): wasted 0x%x bytes due to alignment, try to increase MAX_NUM_FREEMEM_REG\n",
(unsigned int)(rem_small.end - rem_small.start));
}
return reg.start;
}
/*
* Clearly, this structure is only needed if the CPU has an MMU!
*
* The following are not the smallest areas that could be allocated for a
* working system. If the amount of memory used by the page tables is
* critical, they could be reduced.
*/
/*
 * VxWorks physical-memory descriptor table consumed by the MMU library.
 * Each entry: virtual address, physical address, page-aligned length,
 * attribute mask, attribute value.
 * NOTE(review): the array's closing brace is not visible in this excerpt —
 * the table is truncated here.
 */
PHYS_MEM_DESC sysPhysMemDesc [] =
{
/* DRAM - Always the first entry */
{
DDR_MCORE_ADDR, /* virtual address */
DDR_MCORE_ADDR, /* physical address (1:1 mapped) */
ROUND_UP (DDR_MCORE_SIZE, PAGE_SIZE), /* length, rounded to a whole page */
MMU_ATTR_VALID_MSK | MMU_ATTR_PROT_MSK | MMU_ATTR_WRITEALLOCATE_MSK, /* attribute mask */
#ifdef _WRS_CONFIG_SMP /* needs to be shared */
MMU_ATTR_VALID | MMU_ATTR_SUP_RWX | MMU_ATTR_WRITEALLOCATE_SHARED
#else
MMU_ATTR_VALID | MMU_ATTR_SUP_RWX | MMU_ATTR_WRITEALLOCATE
#endif /* _WRS_CONFIG_SMP */
},
/**************************************GU memmap begin**************************/
{/*GU reserved mem 1xxx*/
DDR_GU_ADDR, /* virtual address */
DDR_GU_ADDR, /* physical address (1:1 mapped) */
ROUND_UP (ECS_BBPHAC_BASE_ADDR - DDR_GU_ADDR, PAGE_SIZE), /* length up to the BBPHAC base */
MMU_ATTR_VALID_MSK | MMU_ATTR_PROT_MSK | MMU_ATTR_NORMAL_NONCACHEABLE_MSK, /* attribute mask */
MMU_ATTR_VALID | MMU_ATTR_SUP_RWX | MMU_ATTR_NORMAL_NONCACHEABLE
},
/**
* Create initial (temporary) page tables.
*
* We use 1MB (ARM_L1_SECTION_BYTES) pages (sections) with a single-level table.
* This allows 1MB*4k (ARM_L1_MAX_ENTRIES) = 4G per pagetable.
*
* Hardware details can be found in:
* ARM Architecture Reference Manual, ARMv7-A and ARMv7-R edition
* B3: Virtual Memory System Architecture (VMSA)
*/
/*
 * paging_init: build the initial single-level (1MB section) page tables
 * and turn the MMU on.  Statement order here is hardware-critical: the
 * tables must be fully written and TTBR0/TTBR1 programmed before
 * cp15_enable_mmu(), and caches/TLB are invalidated on both sides of
 * the switch.
 */
void paging_init(void)
{
/**
 * Make sure our page tables are correctly aligned in memory
 */
assert(ROUND_UP((lpaddr_t)l1_low, ARM_L1_ALIGN) == (lpaddr_t)l1_low);
assert(ROUND_UP((lpaddr_t)l1_high, ARM_L1_ALIGN) == (lpaddr_t)l1_high);
/**
 * On ARMv7-A, physical RAM (PHYS_MEMORY_START) is the same with the
 * offset of mapped physical memory within virtual address space
 * (PHYS_MEMORY_START).
 */
STATIC_ASSERT(MEMORY_OFFSET == PHYS_MEMORY_START, "");
/**
 * Zero the page tables: this has the effect of marking every PTE
 * as invalid.
 */
memset(&l1_low, 0, sizeof(l1_low));
memset(&l1_high, 0, sizeof(l1_high));
memset(&l2_vec, 0, sizeof(l2_vec));
/**
 * Now we lay out the kernel's virtual address space.
 *
 * 00000000-7FFFFFFFF: 1-1 mappings (hardware we have not mapped
 *                     into high kernel space yet)
 * 80000000-BFFFFFFFF: 1-1 mappings (this is 1GB of RAM)
 * C0000000-FEFFFFFFF: On-demand mappings of hardware devices,
 *                     allocated descending from DEVICE_OFFSET.
 * FF000000-FFEFFFFFF: Unallocated.
 * FFF00000-FFFFFFFFF: L2 table, containing:
 *      FFF00000-FFFEFFFF: Unallocated
 *      FFFF0000-FFFFFFFF: Exception vectors
 */
lvaddr_t base = 0;
size_t i;
/* Low (TTBR0) table: identity-map the bottom half of the address space,
 * one 1MB section per entry; make_dev_section presumably applies device
 * memory attributes — confirm against its definition. */
for (i=0, base = 0; i < ARM_L1_MAX_ENTRIES/2; i++) {
map_kernel_section_lo(base, make_dev_section(base));
base += ARM_L1_SECTION_BYTES;
}
/* High (TTBR1) table: map a quarter of the entries (1GB of RAM) starting
 * at MEMORY_OFFSET with RAM attributes. */
for (i=0, base = MEMORY_OFFSET; i < ARM_L1_MAX_ENTRIES/4; i++) {
map_kernel_section_hi(base, make_ram_section(base));
base += ARM_L1_SECTION_BYTES;
}
/* Map the exception vectors. */
map_vectors();
/**
 * TTBCR: Translation Table Base Control register.
 * TTBCR.N is bits[2:0]
 * In a TLB miss TTBCR.N determines whether TTBR0 or TTBR1 is used as the
 * base address for the translation table walk in memory:
 * N == 0 -> always use TTBR0
 * N >  0 -> if VA[31:32-N] > 0 use TTBR1 else use TTBR0
 *
 * TTBR0 is typically used for processes-specific addresses
 * TTBR1 is typically used for OS addresses that do not change on context
 * switch
 *
 * set TTBCR.N = 1 to use TTBR1 for VAs >= MEMORY_OFFSET (=2GB)
 */
/* this function must run with translation still disabled */
assert(mmu_enabled == false);
cp15_invalidate_i_and_d_caches_fast();
cp15_invalidate_tlb();
cp15_write_ttbr1((lpaddr_t)l1_high);
cp15_write_ttbr0((lpaddr_t)l1_low);
#define TTBCR_N 1
uint32_t ttbcr = cp15_read_ttbcr();
ttbcr = (ttbcr & ~7) | TTBCR_N;  /* only replace TTBCR.N (bits[2:0]) */
cp15_write_ttbcr(ttbcr);
/* the TTBR0/TTBR1 split implied by N must coincide with MEMORY_OFFSET */
STATIC_ASSERT(1UL<<(32-TTBCR_N) == MEMORY_OFFSET, "");
#undef TTBCR_N
cp15_enable_mmu();
cp15_enable_alignment();
/* flush again now that translation is on, then record the new state */
cp15_invalidate_i_and_d_caches_fast();
cp15_invalidate_tlb();
mmu_enabled = true;
}
/* (scraped-page footer, "please leave a comment" — not part of the source) */