Linux Core Kernel Commentary

Kernel/sched.c


26041 /* 26042 * linux/kernel/sched.c 26043 * 26044 * Copyright (C) 1991, 1992 Linus Torvalds 26045 * 26046 * 1996-12-23 Modified by Dave Grothe to fix bugs in 26047 * semaphores and make semaphores SMP safe 26048 * 1997-01-28 Modified by Finn Arne Gangstad to make 26049 * timers scale better. 26050 * 1997-09-10 Updated NTP code according to technical 26051 * memorandum Jan '96 "A Kernel Model for Precision 26052 * Timekeeping" by Dave Mills 26053 * 1998-11-19 Implemented schedule_timeout() and related 26054 * stuff by Andrea Arcangeli 26055 * 1998-12-24 Fixed a xtime SMP race (we need the 26056 * xtime_lock rw spinlock to serialize accesses to 26057 * xtime/lost_ticks). Copyright (C) 1998 Andrea 26058 * Arcangeli 26059 * 1998-12-28 Implemented better SMP scheduling by Ingo 26060 * Molnar 26061 * 1999-03-10 Improved NTP compatibility by Ulrich Windl 26062 */ 26063 26064 /* 'sched.c' is the main kernel file. It contains 26065 * scheduling primitives (sleep_on, wakeup, schedule etc) 26066 * as well as a number of simple system call functions 26067 * (type getpid()), which just extract a field from 26068 * current-task */ 26069 26070 #include <linux/mm.h> 26071 #include <linux/kernel_stat.h> 26072 #include <linux/fdreg.h> 26073 #include <linux/delay.h> 26074 #include <linux/interrupt.h> 26075 #include <linux/smp_lock.h> 26076 #include <linux/init.h> 26077 26078 #include <asm/io.h> 26079 #include <asm/uaccess.h> 26080 #include <asm/pgtable.h> 26081 #include <asm/mmu_context.h> 26082 #include <asm/semaphore-helper.h> 26083 26084 #include <linux/timex.h> 26085 26086 /* kernel variables */ 26087 26088 /* systemwide security settings */ 26089 unsigned securebits = SECUREBITS_DEFAULT; 26090 26091 /* timer interrupt period */ 26092 long tick = (1000000 + HZ/2) / HZ; 26093 26094 /* The current time */ 26095 volatile struct timeval 26096 xtime __attribute__ ((aligned (16))); 26097 26098 /* Don't completely fail for HZ > 500. */ 26099 int tickadj = 500/HZ ? 
: 1; /* microsecs */ 26100 26101 DECLARE_TASK_QUEUE(tq_timer); 26102 DECLARE_TASK_QUEUE(tq_immediate); 26103 DECLARE_TASK_QUEUE(tq_scheduler); 26104 26105 /* phase-lock loop variables */ 26106 /* TIME_ERROR prevents overwriting the CMOS clock */ 26107 /* clock synchronization status */ 26108 int time_state = TIME_OK; 26109 /* clock status bits */ 26110 int time_status = STA_UNSYNC; 26111 /* time adjustment (us) */ 26112 long time_offset = 0; 26113 /* pll time constant */ 26114 long time_constant = 2; 26115 /* frequency tolerance (ppm) */ 26116 long time_tolerance = MAXFREQ; 26117 /* clock precision (us) */ 26118 long time_precision = 1; 26119 /* maximum error (us) */ 26120 long time_maxerror = NTP_PHASE_LIMIT; 26121 /* estimated error (us) */ 26122 long time_esterror = NTP_PHASE_LIMIT; 26123 /* phase offset (scaled us) */ 26124 long time_phase = 0; 26125 /* frequency offset (scaled ppm) */ 26126 long time_freq = 26127 ((1000000 + HZ/2) % HZ - HZ/2) << SHIFT_USEC; 26128 /* tick adjust (scaled 1 / HZ) */ 26129 long time_adj = 0; 26130 /* time at last adjustment (s) */ 26131 long time_reftime = 0; 26132 26133 long time_adjust = 0; 26134 long time_adjust_step = 0; 26135 26136 unsigned long event = 0; 26137 26138 extern int do_setitimer(int, struct itimerval *, 26139 struct itimerval *); 26140 unsigned int * prof_buffer = NULL; 26141 unsigned long prof_len = 0; 26142 unsigned long prof_shift = 0; 26143 26144 extern void mem_use(void); 26145 26146 unsigned long volatile jiffies=0; 26147 26148 /* Init task must be ok at boot for the ix86 as we will 26149 * check its signals via the SMP irq return path. */ 26150 struct task_struct * task[NR_TASKS] = {&init_task, }; 26151 26152 struct kernel_stat kstat = { 0 }; 26153 26154 void scheduling_functions_start_here(void) { } 26155 26156 #ifdef __SMP__
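A note on the arithmetic above: tick and tickadj are computed purely from HZ. A minimal user-space sketch (not kernel code, and assuming the usual i386 value HZ = 100) that reproduces the same rounding:

#include <stdio.h>

#define HZ 100                          /* assumption: the i386 default */

int main(void)
{
        /* timer interrupt period in microseconds, rounded to nearest */
        long tick = (1000000 + HZ/2) / HZ;
        /* adjtime() slew per tick; the ?: in the listing keeps it from reaching 0 */
        int tickadj = 500/HZ ? 500/HZ : 1;

        printf("HZ=%d  tick=%ld us  tickadj=%d us\n", HZ, tick, tickadj);
        return 0;
}

For HZ = 100 this prints tick = 10000 and tickadj = 5; for HZ > 500 the conditional is what keeps tickadj at 1 instead of 0.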


26157 static void reschedule_idle_slow(struct task_struct * p) 26158 { 26159 /* (see reschedule_idle() for an explanation first ...) 26160 * 26161 * Pass #2 26162 * 26163 * We try to find another (idle) CPU for this woken-up 26164 * process. 26165 * 26166 * On SMP, we mostly try to see if the CPU the task used 26167 * to run on is idle.. but we will use another idle CPU 26168 * too, at this point we already know that this CPU is 26169 * not willing to reschedule in the near future. 26170 * 26171 * An idle CPU is definitely wasted, especially if this 26172 * CPU is running long-timeslice processes. The following 26173 * algorithm is pretty good at finding the best idle CPU 26174 * to send this process to. 26175 * 26176 * [We can try to preempt low-priority processes on other 26177 * CPUs in 2.3. Also we can try to use the avg_slice 26178 * value to predict 'likely reschedule' events even on 26179 * other CPUs.] */ 26180 int best_cpu = p->processor, 26181 this_cpu = smp_processor_id(); 26182 struct task_struct **idle = task, *tsk, *target_tsk; 26183 int i = smp_num_cpus; 26184 26185 target_tsk = NULL; 26186 do { 26187 tsk = *idle; 26188 idle++; 26189 if (tsk->has_cpu) { 26190 if (tsk->processor == this_cpu) 26191 continue;

26192 target_tsk = tsk; 26193 if (tsk->processor == best_cpu) { 26194 /* bingo, we couldn't get a better CPU, activate 26195 * it. */ 26196 goto send; /* this one helps GCC ... */ 26197 } 26198 } 26199 } while (--i > 0); 26200 26201 /* found any idle CPU? */ 26202 if (target_tsk) { 26203 send: 26204 target_tsk->need_resched = 1; 26205 smp_send_reschedule(target_tsk->processor); 26206 return; 26207 } 26208 } 26209 #endif /* __SMP__ */ 26210 26211 /* If there is a dependency between p1 and p2, don't be 26212 * too eager to go into the slow schedule. In 26213 * particular, if p1 and p2 both want the kernel lock, 26214 * there is no point in trying to make them extremely 26215 * parallel.. 26216 * 26217 * (No lock - lock_depth < 0) */ 26218 #define related(p1,p2) \ 26219 ((p1)->lock_depth >= 0 && (p2)->lock_depth >= 0) 26220



26221 static inline void reschedule_idle( 26222 struct task_struct * p) 26223 { 26224 26225 if (p->policy != SCHED_OTHER || 26226 p->counter > current->counter + 3) { 26227 current->need_resched = 1; 26228 return; 26229 } 26230 26231 #ifdef __SMP__ 26232 /* ("wakeup()" should not be called before we've 26233 * initialized SMP completely. Basically a not-yet 26234 * initialized SMP subsystem can be considered as a 26235 * not-yet working scheduler, simply dont use it before 26236 * it's up and running ...) 26237 * 26238 * SMP rescheduling is done in 2 passes: 26239 * - pass #1: faster: quick decisions 26240 * - pass #2: slower: let's try to find another CPU */ 26241 26242 /* Pass #1 26243 * 26244 * There are two metrics here: 26245 * 26246 * first, a 'cutoff' interval, currently 0-200 usecs on 26247 * x86 CPUs, depending on the size of the 'SMP-local 26248 * cache'. If the current process has longer average 26249 * timeslices than this, then we utilize the idle CPU. 26250 * 26251 * second, if the wakeup comes from a process context, 26252 * then the two processes are 'related'. (they form a 26253 * 'gang') 26254 * 26255 * An idle CPU is almost always a bad thing, thus we 26256 * skip the idle-CPU utilization only if both these 26257 * conditions are true. (ie. a 'process-gang' 26258 * rescheduling with rather high frequency should stay 26259 * on the same CPU). 26260 * 26261 * [We can switch to something more finegrained in 26262 * 2.3.] */ 26263 if ((current->avg_slice < cacheflush_time) && 26264 related(current, p)) 26265 return; 26266 26267 reschedule_idle_slow(p); 26268 #endif /* __SMP__ */ 26269 } 26270 26271 /* Careful! 26272 * 26273 * This has to add the process to the _beginning_ of the 26274 * run-queue, not the end. See the comment about "This is 26275 * subtle" in the scheduler proper.. */ 26276 static inline void add_to_runqueue(struct task_struct *p) 26277 { 26278 struct task_struct *next = init_task.next_run; 26279 26280 p->prev_run = &init_task; 26281 init_task.next_run = p; 26282 p->next_run = next; 26283 next->prev_run = p; 26284 nr_running++; 26285 } 26286 26287 static inline void del_from_runqueue( 26288 struct task_struct * p) 26289 { 26290 struct task_struct *next = p->next_run; 26291 struct task_struct *prev = p->prev_run; 26292 26293 nr_running--; 26294 next->prev_run = prev; 26295 prev->next_run = next; 26296 p->next_run = NULL; 26297 p->prev_run = NULL; 26298 } 26299 26300 static inline void move_last_runqueue( 26301 struct task_struct * p) 26302 { 26303 struct task_struct *next = p->next_run; 26304 struct task_struct *prev = p->prev_run; 26305 26306 /* remove from list */ 26307 next->prev_run = prev; 26308 prev->next_run = next; 26309 /* add back to list */ 26310 p->next_run = &init_task; 26311 prev = init_task.prev_run; 26312 init_task.prev_run = p; 26313 p->prev_run = prev; 26314 prev->next_run = p; 26315 } 26316 26317 static inline void 26318 move_first_runqueue(struct task_struct * p) 26319 { 26320 struct task_struct *next = p->next_run; 26321 struct task_struct *prev = p->prev_run; 26322 26323 /* remove from list */ 26324 next->prev_run = prev; 26325 prev->next_run = next; 26326 /* add back to list */ 26327 p->prev_run = &init_task; 26328 next = init_task.next_run; 26329 init_task.next_run = p; 26330 p->next_run = next; 26331 next->prev_run = p; 26332 } 26333 26334 /* The tasklist_lock protects the linked list of 26335 * processes.
26336 * 26337 * The scheduler lock is protecting against multiple 26338 * entry into the scheduling code, and doesn't need to 26339 * worry about interrupts (because interrupts cannot call 26340 * the scheduler). 26341 * 26342 * The run-queue lock locks the parts that actually 26343 * access and change the run-queues, and have to be 26344 * interrupt-safe. */ 26345 /* should be acquired first */ 26346 spinlock_t scheduler_lock = SPIN_LOCK_UNLOCKED; 26347 spinlock_t runqueue_lock = SPIN_LOCK_UNLOCKED; /* 2nd */ 26348 rwlock_t tasklist_lock = RW_LOCK_UNLOCKED; /* 3rd */ 26349 26350 /* Wake up a process. Put it on the run-queue if it's not 26351 * already there. The "current" process is always on the 26352 * run-queue (except when the actual re-schedule is in 26353 * progress), and as such you're allowed to do the 26354 * simpler "current->state = TASK_RUNNING" to mark 26355 * yourself runnable without the overhead of this. */ 26356 void wake_up_process(struct task_struct * p) 26357 { 26358 unsigned long flags; 26359 26360 spin_lock_irqsave(&runqueue_lock, flags); 26361 p->state = TASK_RUNNING; 26362 if (!p->next_run) { 26363 add_to_runqueue(p); 26364 reschedule_idle(p); 26365 } 26366 spin_unlock_irqrestore(&runqueue_lock, flags); 26367 } 26368 26369 static void process_timeout(unsigned long __data) 26370 { 26371 struct task_struct * p = (struct task_struct *) __data; 26372 26373 wake_up_process(p); 26374 } 26375 26376 /* This is the function that decides how desirable a 26377 * process is.. You can weigh different processes 26378 * against each other depending on what CPU they've run 26379 * on lately etc to try to handle cache and TLB miss 26380 * penalties. 26381 * 26382 * Return values: 26383 * -1000: never select this 26384 * 0: out of time, recalculate counters 26385 * (but it might still be selected) 26386 * +ve: "goodness" value (the larger, the better) 26387 * +1000: realtime process, select this. */ 26388 static inline int goodness(struct task_struct * p, 26389 struct task_struct * prev, int this_cpu) 26390 { 26391 int policy = p->policy; 26392 int weight; 26393



26394 if (policy & SCHED_YIELD) { 26395 p->policy = policy & ~SCHED_YIELD; 26396 return 0; 26397 } 26398 26399 /* Realtime process, select the first one on the 26400 * runqueue (taking priorities within processes into 26401 * account). */ 26402 if (policy != SCHED_OTHER) 26403 return 1000 + p->rt_priority; 26404 26405 /* Give the process a first-approximation goodness 26406 * value according to the number of clock-ticks it has 26407 * left. 26408 * 26409 * Don't do any other calculations if the time slice is 26410 * over.. */ 26411 weight = p->counter; 26412 if (weight) { 26413 26414 #ifdef __SMP__ 26415 /* Give a largish advantage to the same processor... 26416 * (this is equivalent to penalizing other 26417 * processors) */ 26418 if (p->processor == this_cpu) 26419 weight += PROC_CHANGE_PENALTY; 26420 #endif 26421 26422 /* .. and a slight advantage to the current thread */ 26423 if (p->mm == prev->mm) 26424 weight += 1; 26425 weight += p->priority; 26426 } 26427 26428 return weight; 26429 } 26430 26431 /* Event timer code */ 26432 #define TVN_BITS 6 26433 #define TVR_BITS 8 26434 #define TVN_SIZE (1 << TVN_BITS) 26435 #define TVR_SIZE (1 << TVR_BITS) 26436 #define TVN_MASK (TVN_SIZE - 1) 26437 #define TVR_MASK (TVR_SIZE - 1) 26438 26439 struct timer_vec { 26440 int index; 26441 struct timer_list *vec[TVN_SIZE]; 26442 }; 26443 26444 struct timer_vec_root { 26445 int index; 26446 struct timer_list *vec[TVR_SIZE]; 26447 }; 26448 26449 static struct timer_vec tv5 = { 0 }; 26450 static struct timer_vec tv4 = { 0 }; 26451 static struct timer_vec tv3 = { 0 }; 26452 static struct timer_vec tv2 = { 0 }; 26453 static struct timer_vec_root tv1 = { 0 }; 26454 26455 static struct timer_vec * const tvecs[] = { 26456 (struct timer_vec *)&tv1, &tv2, &tv3, &tv4, &tv5 26457 }; 26458 26459 #define NOOF_TVECS (sizeof(tvecs) / sizeof(tvecs[0])) 26460 26461 static unsigned long timer_jiffies = 0; 26462 26463 static inline void insert_timer(struct timer_list *timer, 26464 struct timer_list **vec, int idx) 26465 { 26466 if ((timer->next = vec[idx])) 26467 vec[idx]->prev = timer; 26468 vec[idx] = timer; 26469 timer->prev = (struct timer_list *)&vec[idx]; 26470 } 26471 26472 static inline void internal_add_timer( 26473 struct timer_list *timer) 26474 { 26475 /* must be cli-ed when calling this */ 26476 unsigned long expires = timer->expires; 26477 unsigned long idx = expires - timer_jiffies; 26478 26479 if (idx < TVR_SIZE) { 26480 int i = expires & TVR_MASK; 26481 insert_timer(timer, tv1.vec, i); 26482 } else if (idx < 1 << (TVR_BITS + TVN_BITS)) { 26483 int i = (expires >> TVR_BITS) & TVN_MASK; 26484 insert_timer(timer, tv2.vec, i); 26485 } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) { 26486 int i =(expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK; 26487 insert_timer(timer, tv3.vec, i); 26488 } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) { 26489 int i = 26490 (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK; 26491 insert_timer(timer, tv4.vec, i); 26492 } else if ((signed long) idx < 0) { 26493 /* can happen if you add a timer with expires == 26494 * jiffies, or you set a timer to go off in the past 26495 */ 26496 insert_timer(timer, tv1.vec, tv1.index); 26497 } else if (idx <= 0xffffffffUL) { 26498 int i = 26499 (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; 26500 insert_timer(timer, tv5.vec, i); 26501 } else { 26502 /* Can only get here on architectures with 64-bit 26503 * jiffies */ 26504 timer->next = timer->prev = timer; 26505 } 26506 } 26507 26508 spinlock_t timerlist_lock = SPIN_LOCK_UNLOCKED; 26509 
26510 void add_timer(struct timer_list *timer) 26511 { 26512 unsigned long flags; 26513 26514 spin_lock_irqsave(&timerlist_lock, flags); 26515 if (timer->prev) 26516 goto bug; 26517 internal_add_timer(timer); 26518 out: 26519 spin_unlock_irqrestore(&timerlist_lock, flags); 26520 return; 26521 26522 bug: 26523 printk("bug: kernel timer added twice at %p.\n", 26524 __builtin_return_address(0)); 26525 goto out; 26526 } 26527 26528 static inline int detach_timer(struct timer_list *timer) 26529 { 26530 struct timer_list *prev = timer->prev; 26531 if (prev) { 26532 struct timer_list *next = timer->next; 26533 prev->next = next; 26534 if (next) 26535 next->prev = prev; 26536 return 1; 26537 } 26538 return 0; 26539 } 26540 26541 void mod_timer(struct timer_list *timer, 26542 unsigned long expires) 26543 { 26544 unsigned long flags; 26545 26546 spin_lock_irqsave(&timerlist_lock, flags); 26547 timer->expires = expires; 26548 detach_timer(timer); 26549 internal_add_timer(timer); 26550 spin_unlock_irqrestore(&timerlist_lock, flags); 26551 } 26552 26553 int del_timer(struct timer_list * timer) 26554 { 26555 int ret; 26556 unsigned long flags; 26557 26558 spin_lock_irqsave(&timerlist_lock, flags); 26559 ret = detach_timer(timer); 26560 timer->next = timer->prev = 0; 26561 spin_unlock_irqrestore(&timerlist_lock, flags); 26562 return ret; 26563 } 26564 26565 #ifdef __SMP__ 26566 26567 #define idle_task (task[cpu_number_map[this_cpu]]) 26568 #define can_schedule(p) (!(p)->has_cpu) 26569 26570 #else 26571 26572 #define idle_task (&init_task) 26573 #define can_schedule(p) (1) 26574 26575 #endif 26576 26577 signed long schedule_timeout(signed long timeout) 26578 { 26579 struct timer_list timer; 26580 unsigned long expire; 26581 26582 switch (timeout) 26583 { 26584 case MAX_SCHEDULE_TIMEOUT: 26585 /* These two special cases are useful to be 26586 * comfortable in the caller. Nothing more. We could 26587 * take MAX_SCHEDULE_TIMEOUT from one of the negative 26588 * value but I' d like to return a valid offset (>=0) 26589 * to allow the caller to do everything it want with 26590 * the retval. */ 26591 schedule(); 26592 goto out; 26593 default: 26594 /* Another bit of PARANOID. Note that the retval will 26595 * be 0 since no piece of kernel is supposed to do a 26596 * check for a negative retval of schedule_timeout() 26597 * (since it should never happens anyway). You just 26598 * have the printk() that will tell you if something 26599 * is gone wrong and where. */ 26600 if (timeout < 0) 26601 { 26602 printk(KERN_ERR "schedule_timeout: wrong timeout " 26603 "value %lx from %p\n", timeout, 26604 __builtin_return_address(0)); 26605 goto out; 26606 } 26607 } 26608 26609 expire = timeout + jiffies; 26610 26611 init_timer(&timer); 26612 timer.expires = expire; 26613 timer.data = (unsigned long) current; 26614 timer.function = process_timeout; 26615 26616 add_timer(&timer); 26617 schedule(); 26618 del_timer(&timer); 26619 26620 timeout = expire - jiffies; 26621 26622 out: 26623 return timeout < 0 ? 0 : timeout; 26624 } 26625 26626 /* This one aligns per-CPU data on cacheline boundaries. 
26627 */ 26628 static union { 26629 struct schedule_data { 26630 struct task_struct * prev; 26631 long prevstate; 26632 cycles_t last_schedule; 26633 } schedule_data; 26634 char __pad [SMP_CACHE_BYTES]; 26635 } aligned_data [NR_CPUS] __cacheline_aligned = 26636 { {{&init_task,0}}}; 26637 26638 static inline void __schedule_tail (void) 26639 { 26640 #ifdef __SMP__ 26641 struct schedule_data * sched_data; 26642 26643 /* We might have switched CPUs: */ 26644 sched_data = 26645 &aligned_data[smp_processor_id()].schedule_data; 26646 26647 /* Subtle. In the rare event that we got a wakeup to 26648 * 'prev' just during the reschedule (this is possible, 26649 * the scheduler is pretty parallel), we should do 26650 * another reschedule in the next task's 26651 * context. schedule() will do the right thing next 26652 * time around. This is equivalent to 'delaying' the 26653 * wakeup until the reschedule has finished. */



26654 if (sched_data->prev->state != sched_data->prevstate) 26655 current->need_resched = 1; 26656 26657 /* Release the previous process ... 26658 * 26659 * We have dropped all locks, and we must make sure 26660 * that we only mark the previous process as no longer 26661 * having a CPU after all other state has been seen by 26662 * other CPUs. Thus the write memory barrier! */ 26663 wmb(); 26664 sched_data->prev->has_cpu = 0; 26665 #endif /* __SMP__ */ 26666 } 26667 26668 /* schedule_tail() is getting called from the fork return 26669 * path. This cleans up all remaining scheduler things, 26670 * without impacting the common case. */ 26671 void schedule_tail (void) 26672 { 26673 __schedule_tail(); 26674 } 26675 26676 /* 'schedule()' is the scheduler function. It's a very 26677 * simple and nice scheduler: it's not perfect, but 26678 * certainly works for most things. 26679 * 26680 * The goto is "interesting". 26681 * 26682 * NOTE!! Task 0 is the 'idle' task, which gets called 26683 * when no other tasks can run. It can not be killed, and 26684 * it cannot sleep. The 'state' information in task[0] is 26685 * never used. */ 26686 asmlinkage void schedule(void) 26687 { 26688 struct schedule_data * sched_data;

26689 struct task_struct * prev, * next; 26690 int this_cpu; 26691 26692 run_task_queue(&tq_scheduler); 26693 26694 prev = current; 26695 this_cpu = prev->processor; 26696 /* 'sched_data' is protected by the fact that we can 26697 * run only one process per CPU. */ 26698 sched_data = & aligned_data[this_cpu].schedule_data; 26699 26700 if (in_interrupt()) 26701 goto scheduling_in_interrupt; 26702 release_kernel_lock(prev, this_cpu); 26703 26704 /* Do "administrative" work here while we don't hold 26705 * any locks */ 26706 if (bh_active & bh_mask) 26707 do_bottom_half(); 26708 26709 spin_lock(&scheduler_lock); 26710 spin_lock_irq(&runqueue_lock); 26711 26712 /* move an exhausted RR process to be last.. */ 26713 prev->need_resched = 0; 26714 26715 if (!prev->counter && prev->policy == SCHED_RR) { 26716 prev->counter = prev->priority; 26717 move_last_runqueue(prev); 26718 } 26719 26720 switch (prev->state) { 26721 case TASK_INTERRUPTIBLE: 26722 if (signal_pending(prev)) { 26723 prev->state = TASK_RUNNING; 26724 break; 26725 } 26726 default: 26727 del_from_runqueue(prev); 26728 case TASK_RUNNING: 26729 } 26730 26731 sched_data->prevstate = prev->state; 26732 26733 /* this is the scheduler proper: */ 26734 { 26735 struct task_struct * p = init_task.next_run; 26736 int c = -1000; 26737 26738 /* Default process to select.. */ 26739 next = idle_task; 26740 if (prev->state == TASK_RUNNING) { 26741 c = goodness(prev, prev, this_cpu); 26742 next = prev; 26743 } 26744 26745 /* This is subtle. Note how we can enable interrupts 26746 * here, even though interrupts can add processes to 26747 * the run- queue. This is because any new processes 26748 * will be added to the front of the queue, so "p" 26749 * above is a safe starting point. run-queue 26750 * deletion and re-ordering is protected by the 26751 * scheduler lock */ 26752 spin_unlock_irq(&runqueue_lock); 26753 /* Note! there may appear new tasks on the run-queue 26754 * during this, as interrupts are enabled. However, they 26755 * will be put on front of the list, so our list starting 26756 * at "p" is essentially fixed. */ 26757 while (p != &init_task) { 26758 if (can_schedule(p)) { 26759 int weight = goodness(p, prev, this_cpu); 26760 if (weight > c) 26761 c = weight, next = p; 26762 } 26763 p = p->next_run; 26764 } 26765 26766 /* Do we need to re-calculate counters? */



26767 if (!c) { 26768 struct task_struct *p; 26769 read_lock(&tasklist_lock); 26770 for_each_task(p) 26771 p->counter = (p->counter >> 1) + p->priority; 26772 read_unlock(&tasklist_lock); 26773 } 26774 } 26775 26776 /* maintain the per-process 'average timeslice' value. 26777 * ( this has to be recalculated even if we reschedule 26778 * to the same process) Currently this is only used on 26779 * SMP: */ 26780 #ifdef __SMP__ 26781 { 26782 cycles_t t, this_slice; 26783



26784 t = get_cycles(); 26785 this_slice = t - sched_data->last_schedule; 26786 sched_data->last_schedule = t; 26787 26788 /* Simple, exponentially fading average calculation: 26789 */ 26790 prev->avg_slice = this_slice + prev->avg_slice; 26791 prev->avg_slice >>= 1; 26792 } 26793 26794 /* We drop the scheduler lock early (it's a global 26795 * spinlock), thus we have to lock the previous process 26796 * from getting rescheduled during switch_to(). */

26797 next->processor = this_cpu; 26798 next->has_cpu = 1; 26799 spin_unlock(&scheduler_lock); 26800 #endif /* __SMP__ */ 26801 if (prev != next) { 26802 #ifdef __SMP__ 26803 sched_data->prev = prev; 26804 #endif 26805 kstat.context_swtch++; 26806 get_mmu_context(next); 26807 switch_to(prev,next); 26808 26809 __schedule_tail(); 26810 } 26811 26812 reacquire_kernel_lock(current); 26813 return; 26814 26815 scheduling_in_interrupt: 26816 printk("Scheduling in interrupt\n"); 26817 *(int *)0 = 0; 26818 } 26819 26820 rwlock_t waitqueue_lock = RW_LOCK_UNLOCKED; 26821 26822 /* wake_up doesn't wake up stopped processes - they have 26823 * to be awakened with signals or similar. 26824 * 26825 * Note that we only need a read lock for the wait queue 26826 * (and thus do not have to protect against interrupts), 26827 * as the actual removal from the queue is handled by the 26828 * process itself. */

26829 void __wake_up(struct wait_queue **q, unsigned int mode) 26830 { 26831 struct wait_queue *next; 26832 26833 read_lock(&waitqueue_lock); 26834 if (q && (next = *q)) { 26835 struct wait_queue *head; 26836 26837 head = WAIT_QUEUE_HEAD(q); 26838 while (next != head) { 26839 struct task_struct *p = next->task; 26840 next = next->next; 26841 if (p->state & mode) 26842 wake_up_process(p); 26843 } 26844 } 26845 read_unlock(&waitqueue_lock); 26846 } 26847 26848 /* Semaphores are implemented using a two-way counter: 26849 * The "count" variable is decremented for each process 26850 * that tries to sleep, while the "waking" variable is 26851 * incremented when the "up()" code goes to wake up 26852 * waiting processes. 26853 * 26854 * Notably, the inline "up()" and "down()" functions can 26855 * efficiently test if they need to do any extra work (up 26856 * needs to do something only if count was negative 26857 * before the increment operation. 26858 * 26859 * waking_non_zero() (from asm/semaphore.h) must execute 26860 * atomically. 26861 * 26862 * When __up() is called, the count was negative before 26863 * incrementing it, and we need to wake up somebody. 26864 * 26865 * This routine adds one to the count of processes that 26866 * need to wake up and exit. ALL waiting processes 26867 * actually wake up but only the one that gets to the 26868 * "waking" field first will gate through and acquire the 26869 * semaphore. The others will go back to sleep. 26870 * 26871 * Note that these functions are only called when there 26872 * is contention on the lock, and as such all this is the 26873 * "non-critical" part of the whole semaphore 26874 * business. The critical part is the inline stuff in 26875 * <asm/semaphore.h> where we want to avoid any extra 26876 * jumps and calls. */



26877 void __up(struct semaphore *sem) 26878 { 26879 wake_one_more(sem); 26880 wake_up(&sem->wait); 26881 } 26882 26883 /* Perform the "down" function. Return zero for 26884 * semaphore acquired, return negative for signalled out 26885 * of the function. 26886 * 26887 * If called from __down, the return is ignored and the 26888 * wait loop is not interruptible. This means that a 26889 * task waiting on a semaphore using "down()" cannot be 26890 * killed until someone does an "up()" on the semaphore. 26891 * 26892 * If called from __down_interruptible, the return value 26893 * gets checked upon return. If the return value is 26894 * negative then the task continues with the negative 26895 * value in the return register (it can be tested by the 26896 * caller). 26897 * 26898 * Either form may be used in conjunction with "up()". */ 26899

26900 #define DOWN_VAR \ 26901 struct task_struct *tsk = current; \ 26902 struct wait_queue wait = { tsk, NULL }; 26903

26904 #define DOWN_HEAD(task_state) \ 26905 \ 26906 tsk->state = (task_state); \ 26907 add_wait_queue(&sem->wait, &wait); \ 26908 \ 26909 /* Ok, we're set up. sem->count is known to be less \ 26910 * than zero so we must wait. \ 26911 * \ 26912 * We can let go the lock for purposes of waiting. \ 26913 * We re-acquire it after awaking so as to protect \ 26914 * all semaphore operations. \ 26915 * \ 26916 * If "up()" is called before we call \ 26917 * waking_non_zero() then we will catch it right away.\ 26918 * If it is called later then we will have to go \ 26919 * through a wakeup cycle to catch it. \ 26920 * \ 26921 * Multiple waiters contend for the semaphore lock to \ 26922 * see who gets to gate through and who has to wait \ 26923 * some more. */ \ 26924 for (;;) { 26925

26926 #define DOWN_TAIL(task_state) \ 26927 tsk->state = (task_state); \ 26928 } \ 26929 tsk->state = TASK_RUNNING; \ 26930 remove_wait_queue(&sem->wait, &wait); 26931



26932 void __down(struct semaphore * sem) 26933 { 26934 DOWN_VAR 26935 DOWN_HEAD(TASK_UNINTERRUPTIBLE) 26936 if (waking_non_zero(sem)) 26937 break; 26938 schedule(); 26939 DOWN_TAIL(TASK_UNINTERRUPTIBLE) 26940 } 26941
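For reference, once the DOWN_VAR, DOWN_HEAD and DOWN_TAIL macros above are substituted, __down() is roughly equivalent to the following expansion (written out here only for readability, not an extra kernel function):

void __down(struct semaphore *sem)
{
        struct task_struct *tsk = current;
        struct wait_queue wait = { tsk, NULL };

        tsk->state = TASK_UNINTERRUPTIBLE;
        add_wait_queue(&sem->wait, &wait);
        for (;;) {
                if (waking_non_zero(sem))
                        break;
                schedule();
                tsk->state = TASK_UNINTERRUPTIBLE;   /* DOWN_TAIL re-arms the state */
        }
        tsk->state = TASK_RUNNING;
        remove_wait_queue(&sem->wait, &wait);
}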

26942 int __down_interruptible(struct semaphore * sem) 26943 { 26944 DOWN_VAR 26945 int ret = 0; 26946 DOWN_HEAD(TASK_INTERRUPTIBLE) 26947 26948 ret = waking_non_zero_interruptible(sem, tsk); 26949 if (ret) 26950 { 26951 if (ret == 1) 26952 /* ret != 0 only if we get interrupted -arca */ 26953 ret = 0; 26954 break; 26955 } 26956 schedule(); 26957 DOWN_TAIL(TASK_INTERRUPTIBLE) 26958 return ret; 26959 } 26960

26961 int __down_trylock(struct semaphore * sem) 26962 { 26963 return waking_non_zero_trylock(sem); 26964 } 26965 26966 #define SLEEP_ON_VAR \ 26967 unsigned long flags; \ 26968 struct wait_queue wait; 26969 26970 #define SLEEP_ON_HEAD \ 26971 wait.task = current; \ 26972 write_lock_irqsave(&waitqueue_lock, flags); \ 26973 __add_wait_queue(p, &wait); \ 26974 write_unlock(&waitqueue_lock); 26975 26976 #define SLEEP_ON_TAIL \ 26977 write_lock_irq(&waitqueue_lock); \ 26978 __remove_wait_queue(p, &wait); \ 26979 write_unlock_irqrestore(&waitqueue_lock, flags); 26980 26981 void interruptible_sleep_on(struct wait_queue **p) 26982 { 26983 SLEEP_ON_VAR 26984 26985 current->state = TASK_INTERRUPTIBLE; 26986 26987 SLEEP_ON_HEAD 26988 schedule(); 26989 SLEEP_ON_TAIL 26990 } 26991 26992 long interruptible_sleep_on_timeout( 26993 struct wait_queue **p, long timeout) 26994 { 26995 SLEEP_ON_VAR 26996 26997 current->state = TASK_INTERRUPTIBLE; 26998 26999 SLEEP_ON_HEAD 27000 timeout = schedule_timeout(timeout); 27001 SLEEP_ON_TAIL 27002 27003 return timeout; 27004 } 27005 27006 void sleep_on(struct wait_queue **p) 27007 { 27008 SLEEP_ON_VAR 27009 27010 current->state = TASK_UNINTERRUPTIBLE; 27011 27012 SLEEP_ON_HEAD 27013 schedule(); 27014 SLEEP_ON_TAIL 27015 } 27016 27017 long sleep_on_timeout(struct wait_queue **p, 27018 long timeout) 27019 { 27020 SLEEP_ON_VAR 27021 27022 current->state = TASK_UNINTERRUPTIBLE; 27023 27024 SLEEP_ON_HEAD 27025 timeout = schedule_timeout(timeout); 27026 SLEEP_ON_TAIL 27027 27028 return timeout; 27029 } 27030 27031 void scheduling_functions_end_here(void) { } 27032 27033 static inline void cascade_timers(struct timer_vec *tv) 27034 { 27035 /* cascade all the timers from tv up one level */ 27036 struct timer_list *timer; 27037 timer = tv->vec[tv->index]; 27038 /* We are removing _all_ timers from the list, so we 27039 * don't have to detach them individually, just clear 27040 * the list afterwards. */ 27041 while (timer) { 27042 struct timer_list *tmp = timer; 27043 timer = timer->next; 27044 internal_add_timer(tmp); 27045 } 27046 tv->vec[tv->index] = NULL; 27047 tv->index = (tv->index + 1) & TVN_MASK; 27048 } 27049 27050 static inline void run_timer_list(void) 27051 { 27052 spin_lock_irq(&timerlist_lock); 27053 while ((long)(jiffies - timer_jiffies) >= 0) { 27054 struct timer_list *timer; 27055 if (!tv1.index) { 27056 int n = 1; 27057 do { 27058 cascade_timers(tvecs[n]); 27059 } while (tvecs[n]->index == 1 && ++n < NOOF_TVECS); 27060 } 27061 while ((timer = tv1.vec[tv1.index])) { 27062 void (*fn)(unsigned long) = timer->function; 27063 unsigned long data = timer->data; 27064 detach_timer(timer); 27065 timer->next = timer->prev = NULL; 27066 spin_unlock_irq(&timerlist_lock); 27067 fn(data); 27068 spin_lock_irq(&timerlist_lock); 27069 } 27070 ++timer_jiffies; 27071 tv1.index = (tv1.index + 1) & TVR_MASK; 27072 } 27073 spin_unlock_irq(&timerlist_lock); 27074 } 27075 27076
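The bucket selection in internal_add_timer() earlier in the listing is plain shift-and-mask arithmetic on the distance to expiry; run_timer_list() above then walks tv1 slot by slot and cascades the outer vectors. A stand-alone sketch of the same index computation (user space, constants copied from the listing; the negative-idx and 64-bit cases are left out):

#include <stdio.h>

#define TVN_BITS 6
#define TVR_BITS 8
#define TVN_MASK ((1 << TVN_BITS) - 1)
#define TVR_MASK ((1 << TVR_BITS) - 1)

/* Report which vector (tv1..tv5) and which slot a timer lands in,
 * given its expiry and the current timer_jiffies. */
static void pick_bucket(unsigned long expires, unsigned long timer_jiffies)
{
        unsigned long idx = expires - timer_jiffies;

        if (idx < (1UL << TVR_BITS))
                printf("tv1 slot %lu\n", expires & TVR_MASK);
        else if (idx < (1UL << (TVR_BITS + TVN_BITS)))
                printf("tv2 slot %lu\n", (expires >> TVR_BITS) & TVN_MASK);
        else if (idx < (1UL << (TVR_BITS + 2 * TVN_BITS)))
                printf("tv3 slot %lu\n",
                       (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK);
        else if (idx < (1UL << (TVR_BITS + 3 * TVN_BITS)))
                printf("tv4 slot %lu\n",
                       (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK);
        else
                printf("tv5 slot %lu\n",
                       (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK);
}

int main(void)
{
        pick_bucket(1005, 1000);        /* 5 ticks out     -> tv1 */
        pick_bucket(4000, 1000);        /* ~3000 ticks out -> tv2 */
        pick_bucket(200000, 1000);      /* far out         -> tv3 */
        return 0;
}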



27077 static inline void run_old_timers(void) 27078 { 27079 struct timer_struct *tp; 27080 unsigned long mask; 27081 27082 for (mask = 1, tp = timer_table+0; mask; 27083 tp++,mask += mask) { 27084 if (mask > timer_active) 27085 break; 27086 if (!(mask & timer_active)) 27087 continue; 27088 if (time_after(tp->expires, jiffies)) 27089 continue; 27090 timer_active &= ~mask; 27091 tp->fn(); 27092 sti(); 27093 } 27094 } 27095 27096 spinlock_t tqueue_lock; 27097 27098 void tqueue_bh(void) 27099 { 27100 run_task_queue(&tq_timer); 27101 } 27102 27103 void immediate_bh(void) 27104 { 27105 run_task_queue(&tq_immediate); 27106 } 27107 27108 unsigned long timer_active = 0; 27109 struct timer_struct timer_table[32]; 27110 27111 /* Hmm.. Changed this, as the GNU make sources (load.c) 27112 * seems to imply that avenrun[] is the standard name for 27113 * this kind of thing. Nothing else seems to be 27114 * standardized: the fractional size etc all seem to 27115 * differ on different machines. */ 27116 unsigned long avenrun[3] = { 0,0,0 }; 27117 27118 /* Nr of active tasks - counted in fixed-point numbers */ 27119 static unsigned long count_active_tasks(void) 27120 { 27121 struct task_struct *p; 27122 unsigned long nr = 0; 27123 27124 read_lock(&tasklist_lock); 27125 for_each_task(p) { 27126 if ((p->state == TASK_RUNNING || 27127 p->state == TASK_UNINTERRUPTIBLE || 27128 p->state == TASK_SWAPPING)) 27129 nr += FIXED_1; 27130 } 27131 read_unlock(&tasklist_lock); 27132 return nr; 27133 } 27134
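run_old_timers() above compares expiry values with time_after() rather than a plain '>', and run_timer_list() uses the equivalent (long)(jiffies - timer_jiffies) >= 0 test, so both stay correct when jiffies wraps around. time_after() itself lives in <linux/timer.h> and is not reproduced in this listing; a small demonstration of the signed-difference idea, written with 32-bit types so the wrap happens on any host, is:

#include <stdio.h>

/* same shape as the kernel's time_after() (assumed), 32-bit for the demo */
#define time_after32(a, b)  ((int)((b) - (a)) < 0)

int main(void)
{
        unsigned int jiffies = 0xfffffff0u;     /* just before the wrap */
        unsigned int expires = jiffies + 0x20;  /* wraps around to 0x10 */

        printf("expires > jiffies : %d\n", expires > jiffies);              /* 0: naive test fails     */
        printf("time_after32()    : %d\n", time_after32(expires, jiffies)); /* 1: wrap-safe test works */
        return 0;
}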

27135 static inline void calc_load(unsigned long ticks) 27136 { 27137 unsigned long active_tasks; /* fixed-point */ 27138 static int count = LOAD_FREQ; 27139 27140 count -= ticks; 27141 if (count < 0) { 27142 count += LOAD_FREQ; 27143 active_tasks = count_active_tasks(); 27144 CALC_LOAD(avenrun[0], EXP_1, active_tasks); 27145 CALC_LOAD(avenrun[1], EXP_5, active_tasks); 27146 CALC_LOAD(avenrun[2], EXP_15, active_tasks); 27147 } 27148 } 27149 27150 /* this routine handles the overflow of the microsecond 27151 * field 27152 * 27153 * The tricky bits of code to handle the accurate clock 27154 * support were provided by Dave Mills (Mills@UDEL.EDU) 27155 * of NTP fame. They were originally developed for SUN 27156 * and DEC kernels. All the kudos should go to Dave for 27157 * this stuff. */ 27158 static void second_overflow(void) 27159 { 27160 long ltemp; 27161 27162 /* Bump the maxerror field */ 27163 time_maxerror += time_tolerance >> SHIFT_USEC; 27164 if ( time_maxerror > NTP_PHASE_LIMIT ) { 27165 time_maxerror = NTP_PHASE_LIMIT; 27166 time_status |= STA_UNSYNC; 27167 } 27168 27169 /* Leap second processing. If in leap-insert state at 27170 * the end of the day, the system clock is set back one 27171 * second; if in leap-delete state, the system clock is 27172 * set ahead one second. The microtime() routine or 27173 * external clock driver will insure that reported time 27174 * is always monotonic. The ugly divides should be 27175 * replaced. */ 27176 switch (time_state) { 27177 27178 case TIME_OK: 27179 if (time_status & STA_INS) 27180 time_state = TIME_INS; 27181 else if (time_status & STA_DEL) 27182 time_state = TIME_DEL; 27183 break; 27184 27185 case TIME_INS: 27186 if (xtime.tv_sec % 86400 == 0) { 27187 xtime.tv_sec--; 27188 time_state = TIME_OOP; 27189 printk(KERN_NOTICE "Clock: " 27190 "inserting leap second 23:59:60 UTC\n"); 27191 } 27192 break; 27193 27194 case TIME_DEL: 27195 if ((xtime.tv_sec + 1) % 86400 == 0) { 27196 xtime.tv_sec++; 27197 time_state = TIME_WAIT; 27198 printk(KERN_NOTICE "Clock: " 27199 "deleting leap second 23:59:59 UTC\n"); 27200 } 27201 break; 27202 27203 case TIME_OOP: 27204 time_state = TIME_WAIT; 27205 break; 27206 27207 case TIME_WAIT: 27208 if (!(time_status & (STA_INS | STA_DEL))) 27209 time_state = TIME_OK; 27210 } 27211 27212 /* Compute the phase adjustment for the next second. In 27213 * PLL mode, the offset is reduced by a fixed factor 27214 * times the time constant. In FLL mode the offset is 27215 * used directly. In either mode, the maximum phase 27216 * adjustment for each second is clamped so as to 27217 * spread the adjustment over not more than the number 27218 * of seconds between updates. */ 27219 if (time_offset < 0) { 27220 ltemp = -time_offset; 27221 if (!(time_status & STA_FLL)) 27222 ltemp >>= SHIFT_KG + time_constant; 27223 if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) 27224 ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE; 27225 time_offset += ltemp; 27226 time_adj = -ltemp << 27227 (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); 27228 } else { 27229 ltemp = time_offset; 27230 if (!(time_status & STA_FLL)) 27231 ltemp >>= SHIFT_KG + time_constant; 27232 if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) 27233 ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE; 27234 time_offset -= ltemp; 27235 time_adj = ltemp << 27236 (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); 27237 } 27238 27239 /* Compute the frequency estimate and additional phase 27240 * adjustment due to frequency error for the next 27241 * second. 
When the PPS signal is engaged, gnaw on the 27242 * watchdog counter and update the frequency computed 27243 * by the pll and the PPS signal. */ 27244 pps_valid++; 27245 if (pps_valid == PPS_VALID) { /* PPS signal lost */ 27246 pps_jitter = MAXTIME; 27247 pps_stabil = MAXFREQ; 27248 time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | 27249 STA_PPSWANDER | STA_PPSERROR); 27250 } 27251 ltemp = time_freq + pps_freq; 27252 if (ltemp < 0) 27253 time_adj -= -ltemp >> 27254 (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); 27255 else 27256 time_adj += ltemp >> 27257 (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); 27258 27259 #if HZ == 100 27260 /* Compensate for (HZ==100) != (1 << SHIFT_HZ). Add 27261 * 25% and 3.125% to get 128.125; => only 0.125% error 27262 * (p. 14) */ 27263 if (time_adj < 0) 27264 time_adj -= (-time_adj >> 2) + (-time_adj >> 5); 27265 else 27266 time_adj += (time_adj >> 2) + (time_adj >> 5); 27267 #endif 27268 } 27269 27270 /* in the NTP reference this is called "hardclock()" */ 27271 static void update_wall_time_one_tick(void) 27272 { 27273 if ( (time_adjust_step = time_adjust) != 0 ) { 27274 /* We are doing an adjtime thing. 27275 * 27276 * Prepare time_adjust_step to be within bounds. 27277 * Note that a positive time_adjust means we want the 27278 * clock to run faster. 27279 * 27280 * Limit the amount of the step to be in the range 27281 * -tickadj .. +tickadj */ 27282 if (time_adjust > tickadj) 27283 time_adjust_step = tickadj; 27284 else if (time_adjust < -tickadj) 27285 time_adjust_step = -tickadj; 27286 27287 /* Reduce by this step the amount of time left */ 27288 time_adjust -= time_adjust_step; 27289 } 27290 xtime.tv_usec += tick + time_adjust_step; 27291 /* Advance the phase, once it gets to one microsecond, 27292 * then advance the tick more. */ 27293 time_phase += time_adj; 27294 if (time_phase <= -FINEUSEC) { 27295 long ltemp = -time_phase >> SHIFT_SCALE; 27296 time_phase += ltemp << SHIFT_SCALE; 27297 xtime.tv_usec -= ltemp; 27298 } 27299 else if (time_phase >= FINEUSEC) { 27300 long ltemp = time_phase >> SHIFT_SCALE; 27301 time_phase -= ltemp << SHIFT_SCALE; 27302 xtime.tv_usec += ltemp; 27303 } 27304 } 27305 27306 /* Using a loop looks inefficient, but "ticks" is usually 27307 * just one (we shouldn't be losing ticks, we're doing 27308 * this this way mainly for interrupt latency reasons, 27309 * not because we think we'll have lots of lost timer 27310 * ticks */ 27311 static void update_wall_time(unsigned long ticks) 27312 {



27313 do { 27314 ticks--; 27315 update_wall_time_one_tick(); 27316 } while (ticks); 27317 27318 if (xtime.tv_usec >= 1000000) { 27319 xtime.tv_usec -= 1000000; 27320 xtime.tv_sec++; 27321 second_overflow(); 27322 } 27323 } 27324 27325 static inline void do_process_times( 27326 struct task_struct *p, unsigned long user, 27327 unsigned long system) 27328 { 27329 long psecs; 27330 27331 psecs = (p->times.tms_utime += user); 27332 psecs += (p->times.tms_stime += system); 27333 if (psecs / HZ > p->rlim[RLIMIT_CPU].rlim_cur) { 27334 /* Send SIGXCPU every second.. */ 27335 if (!(psecs % HZ)) 27336 send_sig(SIGXCPU, p, 1); 27337 /* and SIGKILL when we go over max.. */ 27338 if (psecs / HZ > p->rlim[RLIMIT_CPU].rlim_max) 27339 send_sig(SIGKILL, p, 1); 27340 } 27341 } 27342 27343 static inline void do_it_virt(struct task_struct * p, 27344 unsigned long ticks) 27345 { 27346 unsigned long it_virt = p->it_virt_value; 27347 27348 if (it_virt) { 27349 if (it_virt <= ticks) { 27350 it_virt = ticks + p->it_virt_incr; 27351 send_sig(SIGVTALRM, p, 1); 27352 } 27353 p->it_virt_value = it_virt - ticks; 27354 } 27355 } 27356 27357 static inline void do_it_prof(struct task_struct * p, 27358 unsigned long ticks) 27359 { 27360 unsigned long it_prof = p->it_prof_value; 27361 27362 if (it_prof) { 27363 if (it_prof <= ticks) { 27364 it_prof = ticks + p->it_prof_incr; 27365 send_sig(SIGPROF, p, 1); 27366 } 27367 p->it_prof_value = it_prof - ticks; 27368 } 27369 } 27370 27371 void update_one_process(struct task_struct *p, 27372 unsigned long ticks, unsigned long user, 27373 unsigned long system, int cpu) 27374 { 27375 p->per_cpu_utime[cpu] += user; 27376 p->per_cpu_stime[cpu] += system; 27377 do_process_times(p, user, system); 27378 do_it_virt(p, user); 27379 do_it_prof(p, ticks); 27380 } 27381 27382 static void update_process_times(unsigned long ticks, 27383 unsigned long system) 27384 { 27385 /* SMP does this on a per-CPU basis elsewhere */ 27386 #ifndef __SMP__ 27387 struct task_struct * p = current; 27388 unsigned long user = ticks - system; 27389 if (p->pid) { 27390 p->counter -= ticks; 27391 if (p->counter < 0) { 27392 p->counter = 0; 27393 p->need_resched = 1; 27394 } 27395 if (p->priority < DEF_PRIORITY) 27396 kstat.cpu_nice += user; 27397 else 27398 kstat.cpu_user += user; 27399 kstat.cpu_system += system; 27400 } 27401 update_one_process(p, ticks, user, system, 0); 27402 #endif 27403 } 27404 27405 volatile unsigned long lost_ticks = 0; 27406 static unsigned long lost_ticks_system = 0; 27407 27408 /* This spinlock protect us from races in SMP while 27409 * playing with xtime. -arca */ 27410 rwlock_t xtime_lock = RW_LOCK_UNLOCKED; 27411
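calc_load(), shown a little earlier, folds the FIXED_1-scaled task count into avenrun[] through CALC_LOAD(), a macro from <linux/sched.h> that is not reproduced in this listing: an exponentially decaying average kept in fixed point. A user-space sketch of that update, with the constants as they are believed to be defined in 2.2 (FSHIFT = 11, hence FIXED_1 = 2048, EXP_1 = 1884):

#include <stdio.h>

#define FSHIFT   11                 /* assumed: bits of fixed-point precision */
#define FIXED_1  (1 << FSHIFT)      /* 1.0 in fixed point                     */
#define EXP_1    1884               /* assumed: exp(-5s/1min), scaled         */

/* same shape as the kernel's CALC_LOAD(load, exp, n) */
static unsigned long calc_load_step(unsigned long load, unsigned long exp,
                                    unsigned long n)
{
        load *= exp;
        load += n * (FIXED_1 - exp);
        return load >> FSHIFT;
}

int main(void)
{
        unsigned long avenrun0 = 0;
        int i;

        /* pretend 3 tasks stay runnable through a minute of 5-second samples */
        for (i = 0; i < 12; i++)
                avenrun0 = calc_load_step(avenrun0, EXP_1, 3 * FIXED_1);

        printf("1-min load: %lu.%02lu\n",
               avenrun0 >> FSHIFT,
               ((avenrun0 & (FIXED_1 - 1)) * 100) >> FSHIFT);
        return 0;
}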



27412 static inline void update_times(void) 27413 { 27414 unsigned long ticks; 27415 27416 /* update_times() is run from the raw timer_bh handler 27417 * so we just know that the irqs are locally enabled 27418 * and so we don't need to save/restore the flags of 27419 * the local CPU here. -arca */ 27420 write_lock_irq(&xtime_lock); 27421 27422 ticks = lost_ticks; 27423 lost_ticks = 0; 27424 27425 if (ticks) { 27426 unsigned long system;

27427 system = xchg(&lost_ticks_system, 0); 27428 27429 calc_load(ticks); 27430 update_wall_time(ticks); 27431 write_unlock_irq(&xtime_lock); 27432 27433 update_process_times(ticks, system); 27434 27435 } else 27436 write_unlock_irq(&xtime_lock); 27437 } 27438

27439 static void timer_bh(void) 27440 { 27441 update_times(); 27442 run_old_timers(); 27443 run_timer_list(); 27444 } 27445
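The timers that run_timer_list() dispatches out of timer_bh() are registered through the add_timer()/del_timer() interface shown earlier, following the same pattern schedule_timeout() uses internally. A schematic driver-style use (my_device and my_timeout_handler are illustrative names only, not part of the kernel):

#include <linux/kernel.h>
#include <linux/timer.h>
#include <linux/sched.h>                 /* jiffies, HZ */

struct my_device {                       /* illustrative */
        struct timer_list watchdog;
        /* ... */
};

static void my_timeout_handler(unsigned long data)
{
        struct my_device *dev = (struct my_device *) data;
        /* runs out of the timer bottom half roughly HZ/2 ticks later */
        printk("my_device %p: watchdog fired\n", dev);
}

static void start_watchdog(struct my_device *dev)
{
        init_timer(&dev->watchdog);                 /* clear prev/next        */
        dev->watchdog.expires  = jiffies + HZ / 2;  /* half a second from now */
        dev->watchdog.data     = (unsigned long) dev;
        dev->watchdog.function = my_timeout_handler;
        add_timer(&dev->watchdog);
}

static void stop_watchdog(struct my_device *dev)
{
        del_timer(&dev->watchdog);       /* harmless if it already fired */
}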

27446 void do_timer(struct pt_regs * regs) 27447 { 27448 (*(unsigned long *)&jiffies)++; 27449 lost_ticks++; 27450 mark_bh(TIMER_BH); 27451 if (!user_mode(regs)) 27452 lost_ticks_system++; 27453 if (tq_timer) 27454 mark_bh(TQUEUE_BH); 27455 } 27456 27457 #ifndef __alpha__ 27458 27459 /* For backwards compatibility? This can be done in libc 27460 * so Alpha and all newer ports shouldn't need it. */ 27461 asmlinkage unsigned int sys_alarm(unsigned int seconds) 27462 { 27463 struct itimerval it_new, it_old; 27464 unsigned int oldalarm; 27465 27466 it_new.it_interval.tv_sec = it_new.it_interval.tv_usec 27467 = 0; 27468 it_new.it_value.tv_sec = seconds; 27469 it_new.it_value.tv_usec = 0; 27470 do_setitimer(ITIMER_REAL, &it_new, &it_old); 27471 oldalarm = it_old.it_value.tv_sec; 27472 /* ehhh.. We can't return 0 if we have an alarm 27473 * pending.. And we'd better return too much than too 27474 * little anyway */ 27475 if (it_old.it_value.tv_usec) 27476 oldalarm++; 27477 return oldalarm; 27478 } 27479 27480 /* The Alpha uses getxpid, getxuid, and getxgid instead. 27481 * Maybe this should be moved into arch/i386 instead? */ 27482 27483 asmlinkage int sys_getpid(void) 27484 { 27485 /* This is SMP safe - current->pid doesn't change */ 27486 return current->pid; 27487 } 27488 27489 /* This is not strictly SMP safe: p_opptr could change 27490 * from under us. However, rather than getting any lock 27491 * we can use an optimistic algorithm: get the parent 27492 * pid, and go back and check that the parent is still 27493 * the same. If it has changed (which is extremely 27494 * unlikely indeed), we just try again.. 27495 * 27496 * NOTE! This depends on the fact that even if we _do_ 27497 * get an old value of "parent", we can happily 27498 * dereference the pointer: we just can't necessarily 27499 * trust the result until we know that the parent pointer 27500 * is valid. 27501 * 27502 * The "mb()" macro is a memory barrier - a synchronizing 27503 * event. It also makes sure that gcc doesn't optimize 27504 * away the necessary memory references.. The barrier 27505 * doesn't have to have all that strong semantics: on x86 27506 * we don't really require a synchronizing instruction, 27507 * for example. The barrier is more important for code 27508 * generation than for any real memory ordering semantics 27509 * (even if there is a small window for a race, using the 27510 * old pointer is harmless for a while). */ 27511 asmlinkage int sys_getppid(void) 27512 { 27513 int pid; 27514 struct task_struct * me = current; 27515 struct task_struct * parent; 27516 27517 parent = me->p_opptr; 27518 for (;;) { 27519 pid = parent->pid; 27520 #if __SMP__ 27521 { 27522 struct task_struct *old = parent; 27523 mb(); 27524 parent = me->p_opptr; 27525 if (old != parent) 27526 continue; 27527 } 27528 #endif 27529 break; 27530 } 27531 return pid; 27532 } 27533 27534 asmlinkage int sys_getuid(void) 27535 { 27536 /* Only we change this so SMP safe */ 27537 return current->uid; 27538 } 27539 27540 asmlinkage int sys_geteuid(void) 27541 { 27542 /* Only we change this so SMP safe */ 27543 return current->euid; 27544 } 27545 27546 asmlinkage int sys_getgid(void) 27547 { 27548 /* Only we change this so SMP safe */ 27549 return current->gid; 27550 } 27551 27552 asmlinkage int sys_getegid(void) 27553 { 27554 /* Only we change this so SMP safe */ 27555 return current->egid; 27556 } 27557 27558 /* This has been replaced by sys_setpriority. 
Maybe it 27559 * should be moved into the arch dependent tree for those 27560 * ports that require it for backward compatibility? */ 27561 27562 asmlinkage int sys_nice(int increment) 27563 { 27564 unsigned long newprio; 27565 int increase = 0; 27566 27567 /* Setpriority might change our priority at the same 27568 * moment. We don't have to worry. Conceptually one 27569 * call occurs first and we have a single winner. */ 27570 27571 newprio = increment; 27572 if (increment < 0) { 27573 if (!capable(CAP_SYS_NICE)) 27574 return -EPERM; 27575 newprio = -increment; 27576 increase = 1; 27577 } 27578 27579 if (newprio > 40) 27580 newprio = 40; 27581 /* do a "normalization" of the priority (traditionally 27582 * Unix nice values are -20 to 20; Linux doesn't really 27583 * use that kind of thing, but uses the length of the 27584 * timeslice instead (default 210 ms). The rounding is 27585 * why we want to avoid negative values. */ 27586 newprio = (newprio * DEF_PRIORITY + 10) / 20; 27587 increment = newprio; 27588 if (increase) 27589 increment = -increment; 27590 /* Current->priority can change between this point and 27591 * the assignment. We are assigning not doing add/subs 27592 * so thats ok. Conceptually a process might just 27593 * instantaneously read the value we stomp over. I 27594 * don't think that is an issue unless posix makes it 27595 * one. If so we can loop on changes to 27596 * current->priority. */ 27597 newprio = current->priority - increment; 27598 if ((signed) newprio < 1) 27599 newprio = 1; 27600 if (newprio > DEF_PRIORITY*2) 27601 newprio = DEF_PRIORITY*2; 27602 current->priority = newprio; 27603 return 0; 27604 } 27605 27606 #endif 27607 27608 static inline struct task_struct * 27609 find_process_by_pid(pid_t pid) 27610 { 27611 struct task_struct *tsk = current; 27612 27613 if (pid) 27614 tsk = find_task_by_pid(pid); 27615 return tsk; 27616 } 27617
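The "normalization" in sys_nice() above maps the traditional -20..20 nice range onto timeslice ticks. With the usual 2.2 values for i386 (HZ = 100, so DEF_PRIORITY = 20, both assumed here), the arithmetic works out as in this small sketch, which leaves the capability checks aside:

#include <stdio.h>

#define DEF_PRIORITY 20        /* assumed: 20*HZ/100 with HZ = 100 */

/* Reproduces the priority arithmetic of sys_nice() for a process that
 * currently sits at DEF_PRIORITY ticks. */
static long nice_to_priority(long priority, int increment)
{
        long newprio = increment < 0 ? -increment : increment;

        if (newprio > 40)
                newprio = 40;
        /* scale 0..40 onto 0..DEF_PRIORITY*2 ticks, rounding */
        newprio = (newprio * DEF_PRIORITY + 10) / 20;
        if (increment < 0)
                newprio = -newprio;          /* negative nice raises priority */
        priority -= newprio;
        if (priority < 1)
                priority = 1;
        if (priority > DEF_PRIORITY * 2)
                priority = DEF_PRIORITY * 2;
        return priority;
}

int main(void)
{
        printf("nice +10 -> %ld ticks\n", nice_to_priority(DEF_PRIORITY, 10));   /* 10 */
        printf("nice -10 -> %ld ticks\n", nice_to_priority(DEF_PRIORITY, -10));  /* 30 */
        printf("nice +19 -> %ld ticks\n", nice_to_priority(DEF_PRIORITY, 19));   /*  1 */
        return 0;
}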



27618 static int setscheduler( pid_t pid, int policy, 27619 struct sched_param *param) 27620 { 27621 struct sched_param lp; 27622 struct task_struct *p; 27623 int retval; 27624 27625 retval = -EINVAL; 27626 if (!param || pid < 0) 27627 goto out_nounlock; 27628 27629 retval = -EFAULT; 27630 if (copy_from_user(&lp, param, 27631 sizeof(struct sched_param))) 27632 goto out_nounlock; 27633 27634 /* We play safe to avoid deadlocks. */ 27635 spin_lock(&scheduler_lock); 27636 spin_lock_irq(&runqueue_lock); 27637 read_lock(&tasklist_lock); 27638 27639 p = find_process_by_pid(pid); 27640 27641 retval = -ESRCH; 27642 if (!p) 27643 goto out_unlock; 27644 27645 if (policy < 0) 27646 policy = p->policy; 27647 else { 27648 retval = -EINVAL; 27649 if (policy != SCHED_FIFO && policy != SCHED_RR && 27650 policy != SCHED_OTHER) 27651 goto out_unlock; 27652 } 27653 27654 /* Valid priorities for SCHED_FIFO and SCHED_RR are 27655 * 1..99, valid priority for SCHED_OTHER is 0. */ 27656 retval = -EINVAL; 27657 if (lp.sched_priority < 0 || lp.sched_priority > 99) 27658 goto out_unlock;

27659 if((policy == SCHED_OTHER) != (lp.sched_priority == 0)) 27660 goto out_unlock; 27661 27662 retval = -EPERM; 27663 if ((policy == SCHED_FIFO || policy == SCHED_RR) && 27664 !capable(CAP_SYS_NICE)) 27665 goto out_unlock; 27666 if ((current->euid != p->euid) && 27667 (current->euid != p->uid) && 27668 !capable(CAP_SYS_NICE)) 27669 goto out_unlock; 27670 27671 retval = 0; 27672 p->policy = policy; 27673 p->rt_priority = lp.sched_priority; 27674 if (p->next_run) 27675 move_first_runqueue(p); 27676 27677 current->need_resched = 1; 27678 27679 out_unlock: 27680 read_unlock(&tasklist_lock); 27681 spin_unlock_irq(&runqueue_lock); 27682 spin_unlock(&scheduler_lock); 27683 27684 out_nounlock: 27685 return retval; 27686 } 27687 27688 asmlinkage int sys_sched_setscheduler(pid_t pid, 27689 int policy, struct sched_param *param) 27690 { 27691 return setscheduler(pid, policy, param); 27692 } 27693 27694 asmlinkage int sys_sched_setparam(pid_t pid, 27695 struct sched_param *param) 27696 { 27697 return setscheduler(pid, -1, param); 27698 } 27699 27700 asmlinkage int sys_sched_getscheduler(pid_t pid) 27701 { 27702 struct task_struct *p; 27703 int retval; 27704 27705 retval = -EINVAL; 27706 if (pid < 0) 27707 goto out_nounlock; 27708 27709 read_lock(&tasklist_lock); 27710 27711 retval = -ESRCH; 27712 p = find_process_by_pid(pid); 27713 if (!p) 27714 goto out_unlock; 27715 27716 retval = p->policy; 27717 27718 out_unlock: 27719 read_unlock(&tasklist_lock); 27720 27721 out_nounlock: 27722 return retval; 27723 } 27724 27725 asmlinkage int sys_sched_getparam(pid_t pid, 27726 struct sched_param *param) 27727 { 27728 struct task_struct *p; 27729 struct sched_param lp; 27730 int retval; 27731 27732 retval = -EINVAL; 27733 if (!param || pid < 0) 27734 goto out_nounlock; 27735 27736 read_lock(&tasklist_lock); 27737 p = find_process_by_pid(pid); 27738 retval = -ESRCH; 27739 if (!p) 27740 goto out_unlock; 27741 lp.sched_priority = p->rt_priority; 27742 read_unlock(&tasklist_lock); 27743 27744 /* This one might sleep, we cannot do it with a 27745 * spinlock held ... */ 27746 retval = copy_to_user(param, &lp, 27747 sizeof(*param)) ?
-EFAULT : 0; 27748 27749 out_nounlock: 27750 return retval; 27751 27752 out_unlock: 27753 read_unlock(&tasklist_lock); 27754 return retval; 27755 } 27756 27757 asmlinkage int sys_sched_yield(void) 27758 { 27759 spin_lock(&scheduler_lock); 27760 spin_lock_irq(&runqueue_lock); 27761 if (current->policy == SCHED_OTHER) 27762 current->policy |= SCHED_YIELD; 27763 current->need_resched = 1; 27764 move_last_runqueue(current); 27765 spin_unlock_irq(&runqueue_lock); 27766 spin_unlock(&scheduler_lock); 27767 return 0; 27768 } 27769 27770 asmlinkage int sys_sched_get_priority_max(int policy) 27771 { 27772 int ret = -EINVAL; 27773 27774 switch (policy) { 27775 case SCHED_FIFO: 27776 case SCHED_RR: 27777 ret = 99; 27778 break; 27779 case SCHED_OTHER: 27780 ret = 0; 27781 break; 27782 } 27783 return ret; 27784 } 27785 27786 asmlinkage int sys_sched_get_priority_min(int policy) 27787 { 27788 int ret = -EINVAL; 27789 27790 switch (policy) { 27791 case SCHED_FIFO: 27792 case SCHED_RR: 27793 ret = 1; 27794 break; 27795 case SCHED_OTHER: 27796 ret = 0; 27797 } 27798 return ret; 27799 } 27800 27801 asmlinkage int sys_sched_rr_get_interval(pid_t pid, 27802 struct timespec *interval) 27803 { 27804 struct timespec t; 27805 27806 t.tv_sec = 0; 27807 t.tv_nsec = 150000; 27808 if (copy_to_user(interval, &t, 27809 sizeof(struct timespec))) 27810 return -EFAULT; 27811 return 0; 27812 } 27813 27814 asmlinkage int sys_nanosleep(struct timespec *rqtp, 27815 struct timespec *rmtp) 27816 { 27817 struct timespec t; 27818 unsigned long expire; 27819 27820 if (copy_from_user(&t, rqtp, sizeof(struct timespec))) 27821 return -EFAULT; 27822 27823 if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0 || 27824 t.tv_sec < 0) 27825 return -EINVAL; 27826 27827 27828 if (t.tv_sec == 0 && t.tv_nsec <= 2000000L && 27829 current->policy != SCHED_OTHER) 27830 { 27831 /* Short delay requests up to 2 ms will be handled 27832 * with high precision by a busy wait for all 27833 * real-time processes. 27834 * 27835 * It's important on SMP not to do this holding 27836 * locks. */ 27837 udelay((t.tv_nsec + 999) / 1000); 27838 return 0; 27839 } 27840 27841 expire = 27842 timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); 27843 27844 current->state = TASK_INTERRUPTIBLE; 27845 expire = schedule_timeout(expire); 27846 27847 if (expire) { 27848 if (rmtp) { 27849 jiffies_to_timespec(expire, &t); 27850 if (copy_to_user(rmtp, &t,sizeof(struct timespec))) 27851 return -EFAULT; 27852 } 27853 return -EINTR; 27854 } 27855 return 0; 27856 } 27857 27858 static void show_task(int nr,struct task_struct * p) 27859 { 27860 unsigned long free = 0; 27861 int state; 27862 static const char * stat_nam[] = 27863 { "R", "S", "D", "Z", "T", "W" }; 27864 27865 printk("%-8s %3d ", 27866 p->comm, (p == current) ? -nr : nr); 27867 state = p->state ?
ffz(~p->state) + 1 : 0; 27868 if (((unsigned) state) < 27869 sizeof(stat_nam)/sizeof(char *)) 27870 printk(stat_nam[state]); 27871 else 27872 printk(" "); 27873 #if (BITS_PER_LONG == 32) 27874 if (p == current) 27875 printk(" current "); 27876 else 27877 printk(" %08lX ", thread_saved_pc(&p->tss)); 27878 #else 27879 if (p == current) 27880 printk(" current task "); 27881 else 27882 printk(" %016lx ", thread_saved_pc(&p->tss)); 27883 #endif 27884 { 27885 unsigned long * n = (unsigned long *) (p+1); 27886 while (!*n) 27887 n++; 27888 free = (unsigned long) n - (unsigned long)(p+1); 27889 } 27890 printk("%5lu %5d %6d ", free, p->pid, p->p_pptr->pid); 27891 if (p->p_cptr) 27892 printk("%5d ", p->p_cptr->pid); 27893 else 27894 printk(" "); 27895 if (p->p_ysptr) 27896 printk("%7d", p->p_ysptr->pid); 27897 else 27898 printk(" "); 27899 if (p->p_osptr) 27900 printk(" %5d\n", p->p_osptr->pid); 27901 else 27902 printk("\n"); 27903 27904 { 27905 struct signal_queue *q; 27906 char s[sizeof(sigset_t)*2+1],b[sizeof(sigset_t)*2+1]; 27907 27908 render_sigset_t(&p->signal, s); 27909 render_sigset_t(&p->blocked, b); 27910 printk(" sig: %d %s %s :", 27911 signal_pending(p), s, b); 27912 for (q = p->sigqueue; q ; q = q->next) 27913 printk(" %d", q->info.si_signo); 27914 printk(" X\n"); 27915 } 27916 } 27917 27918 char * render_sigset_t(sigset_t *set, char *buffer) 27919 { 27920 int i = _NSIG, x; 27921 do { 27922 i -= 4, x = 0; 27923 if (sigismember(set, i+1)) x |= 1; 27924 if (sigismember(set, i+2)) x |= 2; 27925 if (sigismember(set, i+3)) x |= 4; 27926 if (sigismember(set, i+4)) x |= 8; 27927 *buffer++ = (x < 10 ? '0' : 'a' - 10) + x; 27928 } while (i >= 4); 27929 *buffer = 0; 27930 return buffer; 27931 } 27932 27933 void show_state(void) 27934 { 27935 struct task_struct *p; 27936 27937 #if (BITS_PER_LONG == 32) 27938 printk("\n" 27939 " free " 27940 " sibling\n"); 27941 printk(" task PC stack pid father " 27942 "child younger older\n"); 27943 #else 27944 printk("\n" 27945 " free " 27946 " sibling\n"); 27947 printk(" task PC stack pid " 27948 "father child younger older\n"); 27949 #endif 27950 read_lock(&tasklist_lock); 27951 for_each_task(p) 27952 show_task((p->tarray_ptr - &task[0]),p); 27953 read_unlock(&tasklist_lock); 27954 } 27955 27956 void __init sched_init(void) 27957 { 27958 /* We have to do a little magic to get the first 27959 * process right in SMP mode. */ 27960 int cpu=hard_smp_processor_id(); 27961 int nr = NR_TASKS; 27962 27963 init_task.processor=cpu; 27964 27965 /* Init task array free list and pidhash table. */ 27966 while(--nr > 0) 27967 add_free_taskslot(&task[nr]); 27968 27969 for(nr = 0; nr < PIDHASH_SZ; nr++) 27970 pidhash[nr] = NULL; 27971 27972 init_bh(TIMER_BH, timer_bh); 27973 init_bh(TQUEUE_BH, tqueue_bh); 27974 init_bh(IMMEDIATE_BH, immediate_bh); 27975 }
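From user space, the setscheduler() path above is reached through the POSIX scheduling calls. A minimal example that asks for SCHED_FIFO at priority 50 (it has to run with CAP_SYS_NICE, normally as root, exactly as the -EPERM check above demands):

#include <stdio.h>
#include <string.h>
#include <sched.h>

int main(void)
{
        struct sched_param sp;

        memset(&sp, 0, sizeof(sp));
        sp.sched_priority = 50;            /* SCHED_FIFO/SCHED_RR range is 1..99 */

        /* pid 0 means "the calling process", as in find_process_by_pid() */
        if (sched_setscheduler(0, SCHED_FIFO, &sp) < 0) {
                perror("sched_setscheduler");
                return 1;
        }
        if (sched_getparam(0, &sp) == 0)
                printf("now SCHED_FIFO, priority %d\n", sp.sched_priority);
        return 0;
}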

