#!/usr/bin/env python

"Builds INDEX for the ports tree."

# Takes around 24 minutes on a P4 3 GHz for 16000 ports, and memory use
# slowly grows to 20 MB, with a spike at 30 MB for a few seconds at the end.
# The total running time of the Python code is 2 minutes, and the running
# time of the "processing" steps, recurse and sort_dag, is only 4 seconds,
# so programming this in C would be a complete waste of time. By comparison,
# running the command "make INDEXJOBS=3 index" takes 17 minutes and shows
# 188% CPU utilisation. The INDEX ends up in the current directory. It is
# topologically sorted, dependencies before dependants. Loops are detected.

# Author:  Michel Talon
# Version: 1.2
# Licence: BSD (revised)
# Date:    November 2006

import os, sys
from time import sleep, time, strftime, localtime
import thread

# Variables to be customized by the user.

portdir = "/usr/ports/"          # Can be replaced at will.
nwork = 2                        # Number of worker threads.
delay = 0.055
index = open('INDEX', 'w')       # Can be replaced by /usr/ports/INDEX-6 at will.
errorlog = open('ErrorLog', 'w')
index_pristine = True

# index_pristine helps cure the problem that some ports get package names
# depending on local settings, such as gimp-gnome-2.2.10_1,1, while official
# packages and INDEX get gimp-2.2.10_1,1. This hack comes from
# /usr/ports/Makefile.

# A sanity check.
if not portdir.endswith('/'):
    print "portdir needs a trailing /"
    sys.exit()

###### Some preparation #################

# We store the relevant information about a port in an associative array,
# indexed by the "origin" of the port. The corresponding value is an array
# containing the various dependencies. So each key represents a node in the
# DAG of the dependency tree.

port2pkg = {}

# We prepare a shell command to run make -V in each port and read the output
# from a pipe. Motivation: see the "describe" target in bsd.port.mk.
# The www site is obtained by grepping the description file.

PORTSDIR = portdir.rstrip('/')   # portdir independence
if index_pristine:               # The "pristine" hack.
    LOCALBASE = "/nonexistentlocal"
    X11BASE = "/nonexistentx"
else:
    LOCALBASE = "/usr/local"
    X11BASE = "/usr/X11R6"

makestring = '''/usr/bin/make \
PORTSDIR=''' + PORTSDIR + ''' \
LOCALBASE=''' + LOCALBASE + ''' \
X11BASE=''' + X11BASE + ''' \
-V PKGNAME \
-V PREFIX \
-V COMMENT \
-V DESCR \
-V MAINTAINER \
-V CATEGORIES \
-V EXTRACT_DEPENDS \
-V PATCH_DEPENDS \
-V FETCH_DEPENDS \
-V BUILD_DEPENDS \
-V RUN_DEPENDS \
-V LIB_DEPENDS '''
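# For illustration, with the default settings above makestring expands to
# the single shell command (wrapped here for readability):
#
#   /usr/bin/make PORTSDIR=/usr/ports LOCALBASE=/nonexistentlocal \
#       X11BASE=/nonexistentx -V PKGNAME -V PREFIX -V COMMENT -V DESCR \
#       -V MAINTAINER -V CATEGORIES -V EXTRACT_DEPENDS -V PATCH_DEPENDS \
#       -V FETCH_DEPENDS -V BUILD_DEPENDS -V RUN_DEPENDS -V LIB_DEPENDS
#
# make prints one line per -V variable, in this exact order, which is what
# get_port_info() below parses.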
###### Now we are set up ##################

###### First some helper functions #######

def get_valid_categories():
    "Hack to get the list of valid categories."
    # This works only in a port directory, so choose one.
    pipe = os.popen("cd " + portdir + "/ftp/wget && make -V VALID_CATEGORIES")
    valid = pipe.readline().split()
    pipe.close()
    return valid

valid_categories = get_valid_categories()   # Compute this only once.

def check_cat(mylist, orig):
    "Check that each category is valid."
    for entry in mylist:
        if entry not in valid_categories:
            mylist.remove(entry)
            print >> errorlog, "Bad category " + entry + " in " + orig
    return mylist

def clean_path(path):
    '''Strip portdir from dependency paths: it needlessly wastes space and
    introduces an explicit dependency on portdir. Also normalize paths which
    use .. to move in the ports tree. The script make_index in Tools does
    the same.'''
    if path.find("..") >= 0:     # Perhaps useful to keep track of them.
        pipe = os.popen("realpath " + path)
        path = pipe.readline()
        pipe.close()
    # Experience shows some dependencies have a trailing / or \n.
    return path.replace(portdir, '', 1).strip().rstrip('/')

def union(list1, list2):
    "Set-theoretic union of two lists, done in place on list1."
    # Using sets would be faster, but this is insignificant.
    for elem in list2:
        if elem not in list1:
            list1.append(elem)

def get_website(description):
    "Parse the description file to get the website."
    website = ""
    for line in file(portdir + description):
        if line.startswith('WWW:'):
            website = line.replace('WWW:', '').strip()
    # Returns the last occurrence of a "WWW:" line, or "";
    # the first occurrence has proven problematic.
    return website

##### Obtain the port information ########

def get_port_info(orig):
    "Obtains one port's information by running make -V."
    pipe = os.popen("cd " + portdir + orig + " && " + makestring)
    sleep(delay)                 # Yields to the other threads.
    info = pipe.read()
    pipe.close()
    buff = info.split('\n')
    # Parse the contents of the buffer.
    description = clean_path(buff[3])
    prefix = buff[1]
    # We must restore the fake prefix introduced by index_pristine = True.
    if index_pristine:
        if prefix == '/nonexistentx':
            prefix = '/usr/X11R6'
        if prefix == '/nonexistentlocal':
            prefix = '/usr/local'
    # The disposition of the fields is as in the INDEX file.
    pkg_info = [buff[0],                           # 0 pkgname
                False,                             # 1 expanded?
                prefix,                            # 2 prefix
                buff[2],                           # 3 comment
                description,                       # 4 description
                buff[4],                           # 5 maintainer
                check_cat(buff[5].split(), orig),  # 6 categories
                [clean_path(deps.split(':')[1])
                 for deps in buff[9].split()],     # 7 build_deps
                [clean_path(deps.split(':')[1])
                 for deps in buff[10].split()],    # 8 run_deps
                get_website(description),          # 9 website
                [clean_path(deps.split(':')[1])
                 for deps in buff[6].split()],     # 10 extract_deps
                [clean_path(deps.split(':')[1])
                 for deps in buff[7].split()],     # 11 patch_deps
                [clean_path(deps.split(':')[1])
                 for deps in buff[8].split()],     # 12 fetch_deps
                0]                                 # 13 color
    # color is 0, 1 or 2 for white, grey or black in the DFS algorithm.
    lib_deps = [clean_path(deps.split(':')[1]) for deps in buff[11].split()]
    union(pkg_info[7], lib_deps)   # build_deps gets lib_deps
    union(pkg_info[8], lib_deps)   # run_deps gets lib_deps
    if len(pkg_info[8]) == 0:
        pkg_info[1] = True         # Nothing left for the recursive expansion.
    return pkg_info

def run_col(collection):
    '''Main loop for treating a collection of ports. Work is done by nwork
    worker threads. Synchronization is provided by an instance of the Wsync
    class.'''
    # Create a Wsync instance, with nwork threads and dowork = True.
    worker = Wsync(nwork)
    # The engine launches tokens and starts worker threads. It runs in the
    # context of the main thread.
    delay_s = delay/nwork - 0.001
    for orig in list_col(collection):
        worker.engine(orig)
        sleep(delay_s)             # Improves performance, spreads the load.
    # Signal the end of work to the threads. No need to protect dowork: only
    # the main thread writes to it.
    worker.dowork = False
    # Block until the last thread dies.
    worker.finish()
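# For illustration only, here is what a port2pkg entry might look like after
# run_col() has filled it. The origin, package name and dependencies below
# are made up, not taken from a real port:
#
# port2pkg['www/foo'] = [
#     'foo-1.0',                 # 0 pkgname
#     False,                     # 1 expanded?
#     '/usr/local',              # 2 prefix
#     'A fictitious web tool',   # 3 comment
#     'www/foo/pkg-descr',       # 4 description
#     'ports@FreeBSD.org',       # 5 maintainer
#     ['www'],                   # 6 categories
#     ['devel/gmake'],           # 7 build_deps
#     ['www/libbar'],            # 8 run_deps
#     'http://example.org/',     # 9 website
#     [], [], [],                # 10-12 extract/patch/fetch deps
#     0]                         # 13 color (white)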
#################################################
#                                               #
#       Recursion and sorting in the DAG        #
#                                               #
#################################################

# A KeyError occurs when a needed dependency doesn't exist in the ports
# tree. For example a dependency on swig11 is recorded, but the ports tree
# has swig13. It may also occur if we don't have a "complete" set of ports,
# that is, one closed under dependency. These KeyErrors should then trigger
# extending the set of ports.

def recurse_exp(p2p):
    '''Recursive expansion of run_deps. At the end we have the complete DAG
    of run time dependencies, which we can topologically sort in the next
    step.'''
    def recurse(p2p, orig):
        "The recursive step."
        for p_run in p2p[orig][8]:
            try:
                if not p2p[p_run][1]:        # expanded == False
                    recurse(p2p, p_run)      # Recurse on unexpanded subdeps.
                union(p2p[orig][8], p2p[p_run][8])
            except KeyError:                 # Some p_run doesn't exist.
                print >> errorlog, "KeyError in recurse", p_run
        p2p[orig][1] = True
    # Now do it.
    for orig in p2p.keys():                  # Recursive expansion of p2p.
        if not p2p[orig][1]:                 # orig unexpanded
            recurse(p2p, orig)

def emit_ordered(p2p, orig):                 # Helper function
    "Fixes expansions and writes an index line."
    def convert(list0):                      # Helper function
        "Convert a list of dependencies to a string of sorted package names."
        nlist = []
        for elem in list0:
            try:
                nlist.append(p2p[elem][0])
            except KeyError:                 # elem is not in the ports tree.
                print >> errorlog, "KeyError in convert", elem
        nlist.sort()
        return " ".join(nlist)               # Returns a string from a list.
    # We take advantage of the visit to expand the other dependency fields.
    try:    # Missing dependencies? Better safe than sorry.
        for p_build in p2p[orig][7]:
            union(p2p[orig][7], p2p[p_build][8])
        for p_ext in p2p[orig][10]:
            union(p2p[orig][10], p2p[p_ext][8])
        for p_pat in p2p[orig][11]:
            union(p2p[orig][11], p2p[p_pat][8])
        for p_fet in p2p[orig][12]:
            union(p2p[orig][12], p2p[p_fet][8])
    except KeyError, reason:                 # Some p_build, etc. doesn't exist.
        print >> errorlog, "KeyError in expand", reason
    line = "|".join([p2p[orig][0], portdir+orig, p2p[orig][2].rstrip('/'),
                     p2p[orig][3], portdir+p2p[orig][4], p2p[orig][5],
                     " ".join(p2p[orig][6]), convert(p2p[orig][7]),
                     convert(p2p[orig][8]), p2p[orig][9],
                     convert(p2p[orig][10]), convert(p2p[orig][11]),
                     convert(p2p[orig][12])])
    print >> index, line    # Fills the INDEX file, in sorted order.

# We visit each node of the DAG in depth-first order; this is algorithm DFS
# in Cormen, Leiserson, Rivest and Stein, "Introduction to Algorithms".
# Each node visited has its other fields expanded.
# The algorithm is: from a node, visit another one if it is white, painting
# it grey immediately and recursing in depth. When returning from the
# recursion, paint it black and emit it. The nodes so emitted come out
# topologically sorted, and the theory also shows that loops in the DAG are
# trivially detected.

def sort_dag(p2p):
    "Expand run_deps into the other fields, and topologically sort the DAG."
    for orig in p2p.keys():                  # Find a new node to visit.
        if p2p[orig][13] == 0:               # White?
            dfs_visit(p2p, orig)

def dfs_visit(p2p, orig):
    "The DFS recursive step."
    # Visit in depth-first order. The algorithm emits ordered nodes and
    # detects possible cycles.
    p2p[orig][13] = 1                        # Paint grey.
    for end in p2p[orig][8]:                 # End of arrow, a run dependency.
        try:
            if p2p[end][13] == 0:            # White?
                dfs_visit(p2p, end)          # Iterate in depth.
            elif p2p[end][13] == 1:          # Grey means a loop.
                print p2p[end][0], p2p[orig][0], "are looping."
        except KeyError:                     # Key doesn't exist.
            print >> errorlog, "KeyError in dfs_visit", end
    p2p[orig][13] = 2    # On return, paint black and emit the index line.
    # Emit the black node. The DFS theory shows they come out sorted.
    emit_ordered(p2p, orig)
    # One can assign an order to the nodes by replacing this with
    # p2p[orig][13] = counter and counter += 1.
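# A minimal, self-contained sketch of the same white/grey/black DFS on a toy
# graph, for illustration only. The function below is not used by the build
# (dfs_visit above does the real work on port2pkg); the graph shape and
# names in the example are made up.

def _dfs_sketch(graph):
    "Return the nodes of graph topologically sorted, dependencies first."
    WHITE, GREY, BLACK = 0, 1, 2
    color = dict.fromkeys(graph, WHITE)
    order = []
    def visit(node):
        color[node] = GREY              # Entered but not finished.
        for dep in graph[node]:
            if color[dep] == WHITE:
                visit(dep)              # Recurse on unvisited dependencies.
            elif color[dep] == GREY:
                pass                    # A grey dependency means a cycle.
        color[node] = BLACK             # All dependencies done: emit.
        order.append(node)
    for node in graph:
        if color[node] == WHITE:
            visit(node)
    return order

# For example, _dfs_sketch({'a': ['b'], 'b': ['c'], 'c': []}) returns
# ['c', 'b', 'a'], i.e. dependencies before dependants.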
###############################################################################
#                                                                             #
# Next is an implementation of threaded execution of "make -V": Python code  #
# cannot run concurrently on several processors, but forked external         #
# programs can. To avoid the large overhead of the Python "threading"        #
# library, we use only the low level "thread" library and construct our own  #
# synchronization from that.                                                  #
#                                                                             #
###############################################################################

class Wsync:
    "Workers synchronize. Collects the threading machinery in a clean place."

    def __init__(self, nwork):
        "Variables needed by our synchronization procedure."
        self.njobs = 0
        self.nthr = nwork
        self.token = 0
        self.dowork = True
        self.order = 0
        self.nwork = nwork                    # Number of worker threads.
        self.eng_l = thread.allocate_lock()   # Controls the token engine loop.
        self.tok_l = thread.allocate_lock()   # Protects njobs and token.
        self.exit_l = thread.allocate_lock()  # Protects nthr at the end.
        self.dag_l = thread.allocate_lock()   # Protects writing to the DAG.
        self.eng_l.acquire()                  # Must be initialized to locked.

    def work(self):
        "Worker function, launching the make -V command."
        while self.dowork:
            self.tok_l.acquire()
            if self.njobs > 0:
                # One should have njobs > 0 here, since threads are
                # launched after tokens.
                orig = self.token
                self.njobs -= 1
                if self.eng_l.locked():
                    self.eng_l.release()
                self.tok_l.release()
                pkg_info = get_port_info(orig)  # Do the real work.
                self.dag_l.acquire()
                port2pkg[orig] = pkg_info       # Put in global storage.
                self.dag_l.release()
            else:                               # njobs == 0, strange!
                self.tok_l.release()
                sleep(0.001)                    # Yields to another thread.
        # Cleanup at the end.
        self.tok_l.acquire()
        if self.njobs > 0:                      # Possibly pick up the last job.
            orig = self.token
            self.njobs -= 1
            self.tok_l.release()
            pkg_info = get_port_info(orig)
            self.dag_l.acquire()
            port2pkg[orig] = pkg_info
            self.dag_l.release()
        else:
            self.tok_l.release()
        self.exit_l.acquire()                   # Signals the end of the thread.
        self.nthr -= 1
        self.exit_l.release()

    def finish(self):
        "Main thread waits for the end of all worker threads."
        get_out = False
        while not get_out:
            self.exit_l.acquire()
            if self.nthr > 0:
                self.exit_l.release()
            else:
                get_out = True                  # Exit.
                self.exit_l.release()

    def engine(self, orig):
        "Distributes tokens, here the parameter orig."
        self.tok_l.acquire()
        self.token = orig
        self.njobs += 1
        self.tok_l.release()
        if self.order < self.nwork:             # Launch nwork threads.
            thread.start_new_thread(self.work, ())
            self.order += 1
        self.eng_l.acquire()    # Blocks the engine until the token is taken.
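# How the token hand-off above works, step by step: engine() publishes one
# token (a port origin) under tok_l, then blocks on eng_l. A worker grabs
# tok_l, copies the token, decrements njobs and releases eng_l, which
# unblocks engine() so the main thread can publish the next token. Thus at
# most one token is in flight at a time, and dag_l serialises the writes to
# the shared port2pkg dictionary.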
##################################################################
# We explore the ports tree, determine the collections of ports, #
# and apply run_col() to them to build the INDEX.                #
##################################################################

# Functions to explore the ports tree. We try to be resilient to refuse
# files here, to obsoleted port directories, etc.

# We want the list of subdirectories of the main ports directory which are
# effectively present on the machine. Some of them may have been removed via
# refuse files. Each one is a collection of ports; tops() returns all the
# collections.

def tops():
    "The directories under portdir."
    subdirs = os.listdir(portdir)
    pipe = os.popen("cd " + portdir + " && make -V SUBDIR")
    subd = pipe.readline().split()
    pipe.close()
    collections = []
    for entry in subdirs:
        if entry in subd:
            collections.append(entry)
    return collections

# For each collection we want to enter each port under it. Some of these
# ports may be empty: they have no Makefile, and they should not be listed
# in the collection's Makefile either. Non-directory files such as Makefile
# and README.html must also be omitted.

def list_col(collection):
    "Lists the valid ports under a collection."
    valid_p = []
    thisdir = portdir + collection
    pipe = os.popen("cd " + thisdir + " && make -V SUBDIR")
    subd = pipe.readline().split()
    pipe.close()
    for port_name in os.listdir(thisdir):
        if port_name in ['Makefile', 'Makefile.inc', 'README.html']:
            pass
        elif port_name in subd:
            orig = collection + "/" + port_name
            valid_p.append(orig)
        else:
            print >> errorlog, "The port " + collection + "/" \
                + port_name + " is obsolete"
    return valid_p

# The "obsolete" case happens when a port has been removed from the ports
# system and purged from the collection Makefile, but the corresponding
# directory has not been removed because it contains README.html.

def run():
    "Drives the index build for the ports system."
    t_start = time()
    for collection in tops():
        # The long step: exploring the ports tree.
        print "......" + collection + "......"
        run_col(collection)
    recurse_exp(port2pkg)   # Expand run_deps.
    sort_dag(port2pkg)      # Topologically sort the DAG and print INDEX.
    time_string = strftime("%M minutes %S seconds.",
                           localtime(time() - t_start))
    print "Total time spent: ", time_string
    index.close()
    errorlog.close()

if __name__ == '__main__':
    run()
    print '''You will find the generated index in INDEX and the errors
encountered in ErrorLog.'''

# To find duplicates just run:
# awk -F '|' '{dupl[$1]++ ; if (dupl[$1] >= 2) print $1}' INDEX
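# For reference, each line of the generated INDEX carries the 13 |-separated
# fields written by emit_ordered() above, in this order:
#
# pkgname|path|prefix|comment|descr-file|maintainer|categories|\
#     build-deps|run-deps|www-site|extract-deps|patch-deps|fetch-deps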